From 7d0642b93780a7309d2954de6f6126d6ceb526f0 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 11 Jul 2012 15:03:18 -0400 Subject: [PATCH 01/48] xen/perf: Define .glob for the different hypercalls. This allows us in perf to have this: 99.67% [kernel] [k] xen_hypercall_sched_op 0.11% [kernel] [k] xen_hypercall_xen_version instead of the borring ever-encompassing: 99.13% [kernel] [k] hypercall_page [v2: Use a macro to define the name and skip] [v3: Use balign per Jan's suggestion] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/xen-head.S | 56 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index aaa7291c9259..7faed5869e5b 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -28,9 +28,61 @@ ENTRY(startup_xen) __FINIT .pushsection .text - .align PAGE_SIZE + .balign PAGE_SIZE ENTRY(hypercall_page) - .skip PAGE_SIZE +#define NEXT_HYPERCALL(x) \ + ENTRY(xen_hypercall_##x) \ + .skip 32 + +NEXT_HYPERCALL(set_trap_table) +NEXT_HYPERCALL(mmu_update) +NEXT_HYPERCALL(set_gdt) +NEXT_HYPERCALL(stack_switch) +NEXT_HYPERCALL(set_callbacks) +NEXT_HYPERCALL(fpu_taskswitch) +NEXT_HYPERCALL(sched_op_compat) +NEXT_HYPERCALL(platform_op) +NEXT_HYPERCALL(set_debugreg) +NEXT_HYPERCALL(get_debugreg) +NEXT_HYPERCALL(update_descriptor) +NEXT_HYPERCALL(ni) +NEXT_HYPERCALL(memory_op) +NEXT_HYPERCALL(multicall) +NEXT_HYPERCALL(update_va_mapping) +NEXT_HYPERCALL(set_timer_op) +NEXT_HYPERCALL(event_channel_op_compat) +NEXT_HYPERCALL(xen_version) +NEXT_HYPERCALL(console_io) +NEXT_HYPERCALL(physdev_op_compat) +NEXT_HYPERCALL(grant_table_op) +NEXT_HYPERCALL(vm_assist) +NEXT_HYPERCALL(update_va_mapping_otherdomain) +NEXT_HYPERCALL(iret) +NEXT_HYPERCALL(vcpu_op) +NEXT_HYPERCALL(set_segment_base) +NEXT_HYPERCALL(mmuext_op) +NEXT_HYPERCALL(xsm_op) +NEXT_HYPERCALL(nmi_op) +NEXT_HYPERCALL(sched_op) +NEXT_HYPERCALL(callback_op) +NEXT_HYPERCALL(xenoprof_op) +NEXT_HYPERCALL(event_channel_op) +NEXT_HYPERCALL(physdev_op) +NEXT_HYPERCALL(hvm_op) +NEXT_HYPERCALL(sysctl) +NEXT_HYPERCALL(domctl) +NEXT_HYPERCALL(kexec_op) +NEXT_HYPERCALL(tmem_op) /* 38 */ +ENTRY(xen_hypercall_rsvr) + .skip 320 +NEXT_HYPERCALL(mca) /* 48 */ +NEXT_HYPERCALL(arch_1) +NEXT_HYPERCALL(arch_2) +NEXT_HYPERCALL(arch_3) +NEXT_HYPERCALL(arch_4) +NEXT_HYPERCALL(arch_5) +NEXT_HYPERCALL(arch_6) + .balign PAGE_SIZE .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") From a3118beb6a8cbe77ae3342125d920205871b0717 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 28 Jun 2012 22:12:36 -0400 Subject: [PATCH 02/48] xen/p2m: Fix the comment describing the P2M tree. It mixed up the p2m_mid_missing with p2m_missing. Also remove some extra spaces. Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 64effdc6da94..e4adbfbdfada 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -22,7 +22,7 @@ * * P2M_PER_PAGE depends on the architecture, as a mfn is always * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to - * 512 and 1024 entries respectively. + * 512 and 1024 entries respectively. * * In short, these structures contain the Machine Frame Number (MFN) of the PFN. * @@ -139,11 +139,11 @@ * / | ~0, ~0, .... | * | \---------------/ * | - * p2m_missing p2m_missing - * /------------------\ /------------\ - * | [p2m_mid_missing]+---->| ~0, ~0, ~0 | - * | [p2m_mid_missing]+---->| ..., ~0 | - * \------------------/ \------------/ + * p2m_mid_missing p2m_missing + * /-----------------\ /------------\ + * | [p2m_missing] +---->| ~0, ~0, ~0 | + * | [p2m_missing] +---->| ..., ~0 | + * \-----------------/ \------------/ * * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) */ @@ -423,7 +423,7 @@ static void free_p2m_page(void *p) free_page((unsigned long)p); } -/* +/* * Fully allocate the p2m structure for a given pfn. We need to check * that both the top and mid levels are allocated, and make sure the * parallel mfn tree is kept in sync. We may race with other cpus, so From 59b294403e9814e7c1154043567f0d71bac7a511 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 19 Jul 2012 10:23:47 -0400 Subject: [PATCH 03/48] xen/x86: Use memblock_reserve for sensitive areas. instead of a big memblock_reserve. This way we can be more selective in freeing regions (and it also makes it easier to understand where is what). [v1: Move the auto_translate_physmap to proper line] [v2: Per Stefano suggestion add more comments] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 48 ++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/p2m.c | 5 +++++ arch/x86/xen/setup.c | 9 -------- 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ff962d4b821e..e532eb50e8d7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -998,7 +998,54 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) return ret; } +/* + * If the MFN is not in the m2p (provided to us by the hypervisor) this + * function won't do anything. In practice this means that the XenBus + * MFN won't be available for the initial domain. */ +static void __init xen_reserve_mfn(unsigned long mfn) +{ + unsigned long pfn; + if (!mfn) + return; + pfn = mfn_to_pfn(mfn); + if (phys_to_machine_mapping_valid(pfn)) + memblock_reserve(PFN_PHYS(pfn), PAGE_SIZE); +} +static void __init xen_reserve_internals(void) +{ + unsigned long size; + + if (!xen_pv_domain()) + return; + + /* xen_start_info does not exist in the M2P, hence can't use + * xen_reserve_mfn. */ + memblock_reserve(__pa(xen_start_info), PAGE_SIZE); + + xen_reserve_mfn(PFN_DOWN(xen_start_info->shared_info)); + xen_reserve_mfn(xen_start_info->store_mfn); + + if (!xen_initial_domain()) + xen_reserve_mfn(xen_start_info->console.domU.mfn); + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + /* + * ALIGN up to compensate for the p2m_page pointing to an array that + * can partially filled (look in xen_build_dynamic_phys_to_machine). + */ + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + + /* We could use xen_reserve_mfn here, but would end up looping quite + * a lot (and call memblock_reserve for each PAGE), so lets just use + * the easy way and reserve it wholesale. */ + memblock_reserve(__pa(xen_start_info->mfn_list), size); + + /* The pagetables are reserved in mmu.c */ +} void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -1362,6 +1409,7 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + xen_reserve_internals(); /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index e4adbfbdfada..6a2bfa43c8a1 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -388,6 +388,11 @@ void __init xen_build_dynamic_phys_to_machine(void) } m2p_override_init(); + + /* NOTE: We cannot call memblock_reserve here for the mfn_list as there + * isn't enough pieces to make it work (for one - we are still using the + * Xen provided pagetable). Do it later in xen_reserve_internals. + */ } unsigned long get_phys_to_machine(unsigned long pfn) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a4790bf22c59..9efca750405d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -424,15 +424,6 @@ char * __init xen_memory_setup(void) e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); - /* - * Reserve Xen bits: - * - mfn_list - * - xen_start_info - * See comment above "struct start_info" in - */ - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); return "Xen"; From 806c312e50f122c47913145cf884f53dd09d9199 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 21 Aug 2012 14:31:24 -0400 Subject: [PATCH 04/48] xen/x86: Workaround 64-bit hypervisor and 32-bit initial domain. If a 64-bit hypervisor is booted with a 32-bit initial domain, the hypervisor deals with the initial domain as "compat" and does some extra adjustments (like pagetables are 4 bytes instead of 8). It also adjusts the xen_start_info->pt_base incorrectly. When booted with a 32-bit hypervisor (32-bit initial domain): .. (XEN) Start info: cf831000->cf83147c (XEN) Page tables: cf832000->cf8b5000 .. [ 0.000000] PT: cf832000 (f832000) [ 0.000000] Reserving PT: f832000->f8b5000 And with a 64-bit hypervisor: (XEN) Start info: 00000000cf831000->00000000cf8314b4 (XEN) Page tables: 00000000cf832000->00000000cf8b6000 [ 0.000000] PT: cf834000 (f834000) [ 0.000000] Reserving PT: f834000->f8b8000 To deal with this, we keep keep track of the highest physical address we have reserved via memblock_reserve. If that address does not overlap with pt_base, we have a gap which we reserve. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e532eb50e8d7..511f92d79e4a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1002,19 +1002,24 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) * If the MFN is not in the m2p (provided to us by the hypervisor) this * function won't do anything. In practice this means that the XenBus * MFN won't be available for the initial domain. */ -static void __init xen_reserve_mfn(unsigned long mfn) +static unsigned long __init xen_reserve_mfn(unsigned long mfn) { - unsigned long pfn; + unsigned long pfn, end_pfn = 0; if (!mfn) - return; + return end_pfn; + pfn = mfn_to_pfn(mfn); - if (phys_to_machine_mapping_valid(pfn)) - memblock_reserve(PFN_PHYS(pfn), PAGE_SIZE); + if (phys_to_machine_mapping_valid(pfn)) { + end_pfn = PFN_PHYS(pfn) + PAGE_SIZE; + memblock_reserve(PFN_PHYS(pfn), end_pfn); + } + return end_pfn; } static void __init xen_reserve_internals(void) { unsigned long size; + unsigned long last_phys = 0; if (!xen_pv_domain()) return; @@ -1022,12 +1027,13 @@ static void __init xen_reserve_internals(void) /* xen_start_info does not exist in the M2P, hence can't use * xen_reserve_mfn. */ memblock_reserve(__pa(xen_start_info), PAGE_SIZE); + last_phys = __pa(xen_start_info) + PAGE_SIZE; - xen_reserve_mfn(PFN_DOWN(xen_start_info->shared_info)); - xen_reserve_mfn(xen_start_info->store_mfn); + last_phys = max(xen_reserve_mfn(PFN_DOWN(xen_start_info->shared_info)), last_phys); + last_phys = max(xen_reserve_mfn(xen_start_info->store_mfn), last_phys); if (!xen_initial_domain()) - xen_reserve_mfn(xen_start_info->console.domU.mfn); + last_phys = max(xen_reserve_mfn(xen_start_info->console.domU.mfn), last_phys); if (xen_feature(XENFEAT_auto_translated_physmap)) return; @@ -1043,8 +1049,14 @@ static void __init xen_reserve_internals(void) * a lot (and call memblock_reserve for each PAGE), so lets just use * the easy way and reserve it wholesale. */ memblock_reserve(__pa(xen_start_info->mfn_list), size); - + last_phys = max(__pa(xen_start_info->mfn_list) + size, last_phys); /* The pagetables are reserved in mmu.c */ + + /* Under 64-bit hypervisor with a 32-bit domain, the hypervisor + * offsets the pt_base by two pages. Hence the reservation that is done + * in mmu.c misses two pages. We correct it here if we detect this. */ + if (last_phys < __pa(xen_start_info->pt_base)) + memblock_reserve(last_phys, __pa(xen_start_info->pt_base) - last_phys); } void xen_setup_shared_info(void) { From 988f0e24bbcbbf550dff016faf8273a94f4eb1af Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 27 Jul 2012 20:10:58 -0400 Subject: [PATCH 05/48] xen/swiotlb: Simplify the logic. Its pretty easy: 1). We only check to see if we need Xen SWIOTLB for PV guests. 2). If swiotlb=force or iommu=soft is set, then Xen SWIOTLB will be enabled. 3). If it is an initial domain, then Xen SWIOTLB will be enabled. 4). Native SWIOTLB must be disabled for PV guests. Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/pci-swiotlb-xen.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 967633ad98c4..b6a534002ab2 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -34,19 +34,20 @@ static struct dma_map_ops xen_swiotlb_dma_ops = { int __init pci_xen_swiotlb_detect(void) { + if (!xen_pv_domain()) + return 0; + /* If running as PV guest, either iommu=soft, or swiotlb=force will * activate this IOMMU. If running as PV privileged, activate it * irregardless. */ - if ((xen_initial_domain() || swiotlb || swiotlb_force) && - (xen_pv_domain())) + if ((xen_initial_domain() || swiotlb || swiotlb_force)) xen_swiotlb = 1; /* If we are running under Xen, we MUST disable the native SWIOTLB. * Don't worry about swiotlb_force flag activating the native, as * the 'swiotlb' flag is the only one turning it on. */ - if (xen_pv_domain()) - swiotlb = 0; + swiotlb = 0; return xen_swiotlb; } From fc2341df9e31be8a3940f4e302372d7ef46bab8c Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 27 Jul 2012 20:16:00 -0400 Subject: [PATCH 06/48] xen/swiotlb: With more than 4GB on 64-bit, disable the native SWIOTLB. If a PV guest is booted the native SWIOTLB should not be turned on. It does not help us (we don't have any PCI devices) and it eats 64MB of good memory. In the case of PV guests with PCI devices we need the Xen-SWIOTLB one. [v1: Rewrite comment per Stefano's suggestion] Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/pci-swiotlb-xen.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index b6a534002ab2..1c1722761eec 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -8,6 +8,11 @@ #include #include +#ifdef CONFIG_X86_64 +#include +#include +#endif + int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { @@ -49,6 +54,15 @@ int __init pci_xen_swiotlb_detect(void) * the 'swiotlb' flag is the only one turning it on. */ swiotlb = 0; +#ifdef CONFIG_X86_64 + /* pci_swiotlb_detect_4gb turns on native SWIOTLB if no_iommu == 0 + * (so no iommu=X command line over-writes). + * Considering that PV guests do not want the *native SWIOTLB* but + * only Xen SWIOTLB it is not useful to us so set no_iommu=1 here. + */ + if (max_pfn > MAX_DMA32_PFN) + no_iommu = 1; +#endif return xen_swiotlb; } From 74838b75379a53678ffc5f59de86161d21e2c808 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 27 Jul 2012 20:55:27 -0400 Subject: [PATCH 07/48] swiotlb: add the late swiotlb initialization function with iotlb memory This enables the caller to initialize swiotlb with its own iotlb memory late in the bootup. See git commit eb605a5754d050a25a9f00d718fb173f24c486ef "swiotlb: add swiotlb_tbl_map_single library function" which will explain the full details of what it can be used for. CC: FUJITA Tomonori [v1: Fold in smatch warning] Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 1 + lib/swiotlb.c | 33 ++++++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index e872526fdc5f..8d08b3ed406d 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -25,6 +25,7 @@ extern int swiotlb_force; extern void swiotlb_init(int verbose); extern void swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); extern unsigned long swiotlb_nr_tbl(void); +extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); /* * Enumeration for sync targets diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 45bc1f83a5ad..f114bf6a8e13 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -170,7 +170,7 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) * Statically reserve bounce buffer space and initialize bounce buffer data * structures for the software IO TLB used to implement the DMA API. */ -void __init +static void __init swiotlb_init_with_default_size(size_t default_size, int verbose) { unsigned long bytes; @@ -206,8 +206,9 @@ swiotlb_init(int verbose) int swiotlb_late_init_with_default_size(size_t default_size) { - unsigned long i, bytes, req_nslabs = io_tlb_nslabs; + unsigned long bytes, req_nslabs = io_tlb_nslabs; unsigned int order; + int rc = 0; if (!io_tlb_nslabs) { io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); @@ -229,16 +230,32 @@ swiotlb_late_init_with_default_size(size_t default_size) order--; } - if (!io_tlb_start) - goto cleanup1; - + if (!io_tlb_start) { + io_tlb_nslabs = req_nslabs; + return -ENOMEM; + } if (order != get_order(bytes)) { printk(KERN_WARNING "Warning: only able to allocate %ld MB " "for software IO TLB\n", (PAGE_SIZE << order) >> 20); io_tlb_nslabs = SLABS_PER_PAGE << order; - bytes = io_tlb_nslabs << IO_TLB_SHIFT; } + rc = swiotlb_late_init_with_tbl(io_tlb_start, io_tlb_nslabs); + if (rc) + free_pages((unsigned long)io_tlb_start, order); + return rc; +} + +int +swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) +{ + unsigned long i, bytes; + + bytes = nslabs << IO_TLB_SHIFT; + + io_tlb_nslabs = nslabs; + io_tlb_start = tlb; io_tlb_end = io_tlb_start + bytes; + memset(io_tlb_start, 0, bytes); /* @@ -288,10 +305,8 @@ swiotlb_late_init_with_default_size(size_t default_size) io_tlb_list = NULL; cleanup2: io_tlb_end = NULL; - free_pages((unsigned long)io_tlb_start, order); io_tlb_start = NULL; -cleanup1: - io_tlb_nslabs = req_nslabs; + io_tlb_nslabs = 0; return -ENOMEM; } From b58aaa4b0b3506c094308342d746f600468c63d9 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 6 Aug 2012 15:27:24 +0100 Subject: [PATCH 08/48] xen: update xen_add_to_physmap interface Update struct xen_add_to_physmap to be in sync with Xen's version of the structure. The size field was introduced by: changeset: 24164:707d27fe03e7 user: Jean Guyader date: Fri Nov 18 13:42:08 2011 +0000 summary: mm: New XENMEM space, XENMAPSPACE_gmfn_range According to the comment: "This new field .size is located in the 16 bits padding between .domid and .space in struct xen_add_to_physmap to stay compatible with older versions." Changes in v2: - remove erroneous comment in the commit message. Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/interface/memory.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h index eac3ce153719..8d4efc1cc64a 100644 --- a/include/xen/interface/memory.h +++ b/include/xen/interface/memory.h @@ -163,6 +163,9 @@ struct xen_add_to_physmap { /* Which domain to change the mapping for. */ domid_t domid; + /* Number of pages to go through for gmfn_range */ + uint16_t size; + /* Source mapping space. */ #define XENMAPSPACE_shared_info 0 /* shared info page */ #define XENMAPSPACE_grant_table 1 /* grant table page */ From 4d9310e39728a87c86eb48492da7546f61189633 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 6 Aug 2012 15:27:09 +0100 Subject: [PATCH 09/48] xen: missing includes Changes in v2: - remove pvclock hack; - remove include linux/types.h from xen/interface/xen.h. v3: - Compile under IA64 Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/ia64/include/asm/xen/interface.h | 2 ++ arch/x86/include/asm/xen/interface.h | 2 ++ drivers/tty/hvc/hvc_xen.c | 2 ++ drivers/xen/grant-table.c | 1 + drivers/xen/xenbus/xenbus_probe_frontend.c | 1 + include/xen/interface/xen.h | 1 - include/xen/privcmd.h | 1 + 7 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/ia64/include/asm/xen/interface.h b/arch/ia64/include/asm/xen/interface.h index 09d5f7fd9db1..ee9cad6e749b 100644 --- a/arch/ia64/include/asm/xen/interface.h +++ b/arch/ia64/include/asm/xen/interface.h @@ -265,6 +265,8 @@ typedef struct xen_callback xen_callback_t; #endif /* !__ASSEMBLY__ */ +#include + /* Size of the shared_info area (this is not related to page size). */ #define XSI_SHIFT 14 #define XSI_SIZE (1 << XSI_SHIFT) diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index cbf0c9d50b92..a93db16e9582 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -121,6 +121,8 @@ struct arch_shared_info { #include "interface_64.h" #endif +#include + #ifndef __ASSEMBLY__ /* * The following is all CPU context. Note that the fpu_ctxt block is filled diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index 944eaeb8e0cf..dc07f56d66b5 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 0bfc1ef11259..1d0d95e5b446 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index a31b54d48839..3159a37d966d 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -21,6 +21,7 @@ #include #include #include +#include #include diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index a890804945e3..3871e4753680 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -10,7 +10,6 @@ #define __XEN_PUBLIC_XEN_H__ #include -#include /* * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS). diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h index 17857fb4d550..4d588814510b 100644 --- a/include/xen/privcmd.h +++ b/include/xen/privcmd.h @@ -35,6 +35,7 @@ #include #include +#include typedef unsigned long xen_pfn_t; From b8b0f559c7b1dcf5503817e518c81c9a18ee45e0 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 21 Aug 2012 14:49:34 -0400 Subject: [PATCH 10/48] xen/apic/xenbus/swiotlb/pcifront/grant/tmem: Make functions or variables static. There is no need for those functions/variables to be visible. Make them static and also fix the compile warnings of this sort: drivers/xen/.c: warning: symbol '' was not declared. Should it be static? Some of them just require including the header file that declares the functions. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/apic.c | 3 ++- arch/x86/xen/enlighten.c | 2 ++ arch/x86/xen/pci-swiotlb-xen.c | 1 + arch/x86/xen/platform-pci-unplug.c | 1 + drivers/pci/xen-pcifront.c | 2 +- drivers/xen/gntdev.c | 2 +- drivers/xen/grant-table.c | 13 ++++++------- drivers/xen/swiotlb-xen.c | 2 +- drivers/xen/tmem.c | 1 + drivers/xen/xenbus/xenbus_dev_backend.c | 2 +- drivers/xen/xenbus/xenbus_probe.c | 4 ++-- 11 files changed, 19 insertions(+), 14 deletions(-) diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index ec57bd3818a4..7005ced5d1ad 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -6,8 +6,9 @@ #include #include +#include "xen-ops.h" -unsigned int xen_io_apic_read(unsigned apic, unsigned reg) +static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) { struct physdev_apic apic_op; int ret; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ff962d4b821e..cb1b1914dbd3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -79,6 +79,8 @@ #include "smp.h" #include "multicalls.h" +#include + EXPORT_SYMBOL_GPL(hypercall_page); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 967633ad98c4..2d58b3ff4fae 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -8,6 +8,7 @@ #include #include +#include int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index ffcf2615640b..0a7852483ffe 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -24,6 +24,7 @@ #include #include +#include "xen-ops.h" #define XEN_PLATFORM_ERR_MAGIC -1 #define XEN_PLATFORM_ERR_PROTOCOL -2 diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index d6cc62cb4cf7..a4d901def8f2 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -236,7 +236,7 @@ static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn, return errno_to_pcibios_err(do_pci_op(pdev, &op)); } -struct pci_ops pcifront_bus_ops = { +static struct pci_ops pcifront_bus_ops = { .read = pcifront_bus_read, .write = pcifront_bus_write, }; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 1ffd03bf8e10..163b7e985ed0 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -445,7 +445,7 @@ static void mn_release(struct mmu_notifier *mn, spin_unlock(&priv->lock); } -struct mmu_notifier_ops gntdev_mmu_ops = { +static struct mmu_notifier_ops gntdev_mmu_ops = { .release = mn_release, .invalidate_page = mn_invl_page, .invalidate_range_start = mn_invl_range_start, diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 1d0d95e5b446..eea81cf8a2a5 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -286,10 +286,9 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); -void gnttab_update_subpage_entry_v2(grant_ref_t ref, domid_t domid, - unsigned long frame, int flags, - unsigned page_off, - unsigned length) +static void gnttab_update_subpage_entry_v2(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags, + unsigned page_off, unsigned length) { gnttab_shared.v2[ref].sub_page.frame = frame; gnttab_shared.v2[ref].sub_page.page_off = page_off; @@ -346,9 +345,9 @@ bool gnttab_subpage_grants_available(void) } EXPORT_SYMBOL_GPL(gnttab_subpage_grants_available); -void gnttab_update_trans_entry_v2(grant_ref_t ref, domid_t domid, - int flags, domid_t trans_domid, - grant_ref_t trans_gref) +static void gnttab_update_trans_entry_v2(grant_ref_t ref, domid_t domid, + int flags, domid_t trans_domid, + grant_ref_t trans_gref) { gnttab_shared.v2[ref].transitive.trans_domid = trans_domid; gnttab_shared.v2[ref].transitive.gref = trans_gref; diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 1afb4fba11b4..a52f3ae05d94 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -52,7 +52,7 @@ static unsigned long xen_io_tlb_nslabs; * Quick lookup value of the bus address of the IOTLB. */ -u64 start_dma_addr; +static u64 start_dma_addr; static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) { diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index 89f264c67420..144564e5eb29 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c @@ -21,6 +21,7 @@ #include #include #include +#include #define TMEM_CONTROL 0 #define TMEM_NEW_POOL 1 diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c index be738c43104b..d73000800762 100644 --- a/drivers/xen/xenbus/xenbus_dev_backend.c +++ b/drivers/xen/xenbus/xenbus_dev_backend.c @@ -107,7 +107,7 @@ static int xenbus_backend_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -const struct file_operations xenbus_backend_fops = { +static const struct file_operations xenbus_backend_fops = { .open = xenbus_backend_open, .mmap = xenbus_backend_mmap, .unlocked_ioctl = xenbus_backend_ioctl, diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index b793723e724d..91d3d6544a7b 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -324,8 +324,8 @@ static int cmp_dev(struct device *dev, void *data) return 0; } -struct xenbus_device *xenbus_device_find(const char *nodename, - struct bus_type *bus) +static struct xenbus_device *xenbus_device_find(const char *nodename, + struct bus_type *bus) { struct xb_find_info info = { .dev = NULL, .nodename = nodename }; From 5c13f8067745efc15f6ad0158b58d57c44104c25 Mon Sep 17 00:00:00 2001 From: Daniel De Graaf Date: Thu, 16 Aug 2012 16:40:26 -0400 Subject: [PATCH 11/48] xen/sysfs: Use XENVER_guest_handle to query UUID This hypercall has been present since Xen 3.1, and is the preferred method for a domain to obtain its UUID. Fall back to the xenstore method if using an older version of Xen (which returns -ENOSYS). Signed-off-by: Daniel De Graaf Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/sys-hypervisor.c | 13 ++++++++++++- include/xen/interface/version.h | 3 +++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index fdb6d229c9bb..5e5ad7e28858 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -114,7 +114,7 @@ static void xen_sysfs_version_destroy(void) /* UUID */ -static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) +static ssize_t uuid_show_fallback(struct hyp_sysfs_attr *attr, char *buffer) { char *vm, *val; int ret; @@ -135,6 +135,17 @@ static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) return ret; } +static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + xen_domain_handle_t uuid; + int ret; + ret = HYPERVISOR_xen_version(XENVER_guest_handle, uuid); + if (ret) + return uuid_show_fallback(attr, buffer); + ret = sprintf(buffer, "%pU\n", uuid); + return ret; +} + HYPERVISOR_ATTR_RO(uuid); static int __init xen_sysfs_uuid_init(void) diff --git a/include/xen/interface/version.h b/include/xen/interface/version.h index e8b6519d47e9..dd58cf5ea3e4 100644 --- a/include/xen/interface/version.h +++ b/include/xen/interface/version.h @@ -60,4 +60,7 @@ struct xen_feature_info { /* arg == NULL; returns host memory page size. */ #define XENVER_pagesize 7 +/* arg == xen_domain_handle_t. */ +#define XENVER_guest_handle 8 + #endif /* __XEN_PUBLIC_VERSION_H__ */ From b5e579232d635b79a3da052964cb357ccda8d9ea Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 22 Aug 2012 17:20:11 +0100 Subject: [PATCH 12/48] xen/events: fix unmask_evtchn for PV on HVM guests When unmask_evtchn is called, if we already have an event pending, we just set evtchn_pending_sel waiting for local_irq_enable to be called. That is because PV guests set the irq_enable pvops to xen_irq_enable_direct in xen_setup_vcpu_info_placement: xen_irq_enable_direct is implemented in assembly in arch/x86/xen/xen-asm.S and call xen_force_evtchn_callback if XEN_vcpu_info_pending is set. However HVM guests (and ARM guests) do not change or do not have the irq_enable pvop, so evtchn_unmask cannot work properly for them. Considering that having the pending_irq bit set when unmask_evtchn is called is not very common, and it is simpler to keep the native_irq_enable implementation for HVM guests (and ARM guests), the best thing to do is just use the EVTCHNOP_unmask hypercall (Xen re-injects pending events in response). Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 7595581d032c..36bf17dc7bfc 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -373,11 +373,22 @@ static void unmask_evtchn(int port) { struct shared_info *s = HYPERVISOR_shared_info; unsigned int cpu = get_cpu(); + int do_hypercall = 0, evtchn_pending = 0; BUG_ON(!irqs_disabled()); - /* Slow path (hypercall) if this is a non-local port. */ - if (unlikely(cpu != cpu_from_evtchn(port))) { + if (unlikely((cpu != cpu_from_evtchn(port)))) + do_hypercall = 1; + else + evtchn_pending = sync_test_bit(port, &s->evtchn_pending[0]); + + if (unlikely(evtchn_pending && xen_hvm_domain())) + do_hypercall = 1; + + /* Slow path (hypercall) if this is a non-local port or if this is + * an hvm domain and an event is pending (hvm domains don't have + * their own implementation of irq_enable). */ + if (do_hypercall) { struct evtchn_unmask unmask = { .port = port }; (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); } else { @@ -390,7 +401,7 @@ static void unmask_evtchn(int port) * 'hw_resend_irq'. Just like a real IO-APIC we 'lose * the interrupt edge' if the channel is masked. */ - if (sync_test_bit(port, &s->evtchn_pending[0]) && + if (evtchn_pending && !sync_test_and_set_bit(port / BITS_PER_LONG, &vcpu_info->evtchn_pending_sel)) vcpu_info->evtchn_upcall_pending = 1; From a8636c0b2e57d4f31f71aa306b1ee701db3f3c85 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 22 Aug 2012 17:20:15 +0100 Subject: [PATCH 13/48] xen: clear IRQ_NOAUTOEN and IRQ_NOREQUEST Reset the IRQ_NOAUTOEN and IRQ_NOREQUEST flags that are enabled by default on ARM. If IRQ_NOAUTOEN is set, __setup_irq doesn't call irq_startup, that is responsible for calling irq_unmask at startup time. As a result event channels remain masked. Signed-off-by: Stefano Stabellini Acked-by: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 36bf17dc7bfc..c60d1629c916 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -842,6 +842,7 @@ int bind_evtchn_to_irq(unsigned int evtchn) struct irq_info *info = info_for_irq(irq); WARN_ON(info == NULL || info->type != IRQT_EVTCHN); } + irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); out: mutex_unlock(&irq_mapping_update_lock); From bd3f79b71de0410352ab506496a467fcb0620912 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 22 Aug 2012 17:20:14 +0100 Subject: [PATCH 14/48] xen: Introduce xen_pfn_t for pfn and mfn types All the original Xen headers have xen_pfn_t as mfn and pfn type, however when they have been imported in Linux, xen_pfn_t has been replaced with unsigned long. That might work for x86 and ia64 but it does not for arm. Bring back xen_pfn_t and let each architecture define xen_pfn_t as they see fit. Signed-off-by: Stefano Stabellini Acked-by: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk --- arch/ia64/include/asm/xen/interface.h | 5 ++++- arch/x86/include/asm/xen/interface.h | 5 +++++ include/xen/interface/grant_table.h | 4 ++-- include/xen/interface/memory.h | 6 +++--- include/xen/interface/platform.h | 4 ++-- include/xen/interface/xen.h | 6 +++--- include/xen/privcmd.h | 2 -- 7 files changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/ia64/include/asm/xen/interface.h b/arch/ia64/include/asm/xen/interface.h index ee9cad6e749b..3d52a5bbd857 100644 --- a/arch/ia64/include/asm/xen/interface.h +++ b/arch/ia64/include/asm/xen/interface.h @@ -67,6 +67,10 @@ #define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0) #ifndef __ASSEMBLY__ +/* Explicitly size integers that represent pfns in the public interface + * with Xen so that we could have one ABI that works for 32 and 64 bit + * guests. */ +typedef unsigned long xen_pfn_t; /* Guest handles for primitive C types. */ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); @@ -79,7 +83,6 @@ DEFINE_GUEST_HANDLE(void); DEFINE_GUEST_HANDLE(uint64_t); DEFINE_GUEST_HANDLE(uint32_t); -typedef unsigned long xen_pfn_t; DEFINE_GUEST_HANDLE(xen_pfn_t); #define PRI_xen_pfn "lx" #endif diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index a93db16e9582..555f94d3637b 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -47,6 +47,10 @@ #endif #ifndef __ASSEMBLY__ +/* Explicitly size integers that represent pfns in the public interface + * with Xen so that on ARM we can have one ABI that works for 32 and 64 + * bit guests. */ +typedef unsigned long xen_pfn_t; /* Guest handles for primitive C types. */ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); @@ -57,6 +61,7 @@ DEFINE_GUEST_HANDLE(long); DEFINE_GUEST_HANDLE(void); DEFINE_GUEST_HANDLE(uint64_t); DEFINE_GUEST_HANDLE(uint32_t); +DEFINE_GUEST_HANDLE(xen_pfn_t); #endif #ifndef HYPERVISOR_VIRT_START diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h index a17d84433e6a..7da811bdd558 100644 --- a/include/xen/interface/grant_table.h +++ b/include/xen/interface/grant_table.h @@ -338,7 +338,7 @@ DEFINE_GUEST_HANDLE_STRUCT(gnttab_dump_table); #define GNTTABOP_transfer 4 struct gnttab_transfer { /* IN parameters. */ - unsigned long mfn; + xen_pfn_t mfn; domid_t domid; grant_ref_t ref; /* OUT parameters. */ @@ -375,7 +375,7 @@ struct gnttab_copy { struct { union { grant_ref_t ref; - unsigned long gmfn; + xen_pfn_t gmfn; } u; domid_t domid; uint16_t offset; diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h index 8d4efc1cc64a..d8e33a93ea4d 100644 --- a/include/xen/interface/memory.h +++ b/include/xen/interface/memory.h @@ -31,7 +31,7 @@ struct xen_memory_reservation { * OUT: GMFN bases of extents that were allocated * (NB. This command also updates the mach_to_phys translation table) */ - GUEST_HANDLE(ulong) extent_start; + GUEST_HANDLE(xen_pfn_t) extent_start; /* Number of extents, and size/alignment of each (2^extent_order pages). */ unsigned long nr_extents; @@ -130,7 +130,7 @@ struct xen_machphys_mfn_list { * any large discontiguities in the machine address space, 2MB gaps in * the machphys table will be represented by an MFN base of zero. */ - GUEST_HANDLE(ulong) extent_start; + GUEST_HANDLE(xen_pfn_t) extent_start; /* * Number of extents written to the above array. This will be smaller @@ -175,7 +175,7 @@ struct xen_add_to_physmap { unsigned long idx; /* GPFN where the source mapping page should appear. */ - unsigned long gpfn; + xen_pfn_t gpfn; }; DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap); diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h index 486653f0dd8f..0bea47027fa2 100644 --- a/include/xen/interface/platform.h +++ b/include/xen/interface/platform.h @@ -54,7 +54,7 @@ DEFINE_GUEST_HANDLE_STRUCT(xenpf_settime_t); #define XENPF_add_memtype 31 struct xenpf_add_memtype { /* IN variables. */ - unsigned long mfn; + xen_pfn_t mfn; uint64_t nr_mfns; uint32_t type; /* OUT variables. */ @@ -84,7 +84,7 @@ struct xenpf_read_memtype { /* IN variables. */ uint32_t reg; /* OUT variables. */ - unsigned long mfn; + xen_pfn_t mfn; uint64_t nr_mfns; uint32_t type; }; diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 3871e4753680..42834a36d345 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -188,7 +188,7 @@ struct mmuext_op { unsigned int cmd; union { /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */ - unsigned long mfn; + xen_pfn_t mfn; /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */ unsigned long linear_addr; } arg1; @@ -428,11 +428,11 @@ struct start_info { unsigned long nr_pages; /* Total pages allocated to this domain. */ unsigned long shared_info; /* MACHINE address of shared info struct. */ uint32_t flags; /* SIF_xxx flags. */ - unsigned long store_mfn; /* MACHINE page number of shared page. */ + xen_pfn_t store_mfn; /* MACHINE page number of shared page. */ uint32_t store_evtchn; /* Event channel for store communication. */ union { struct { - unsigned long mfn; /* MACHINE page number of console page. */ + xen_pfn_t mfn; /* MACHINE page number of console page. */ uint32_t evtchn; /* Event channel for console page. */ } domU; struct { diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h index 4d588814510b..45c1aa14b83d 100644 --- a/include/xen/privcmd.h +++ b/include/xen/privcmd.h @@ -37,8 +37,6 @@ #include #include -typedef unsigned long xen_pfn_t; - struct privcmd_hypercall { __u64 op; __u64 arg[5]; From 1a1d43318aeb74d679372c0b65029957be274529 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 22 Aug 2012 17:20:16 +0100 Subject: [PATCH 15/48] xen: allow privcmd for HVM guests This patch removes the "return -ENOSYS" for auto_translated_physmap guests from privcmd_mmap, thus it allows ARM guests to issue privcmd mmap calls. However privcmd mmap calls are still going to fail for HVM and hybrid guests on x86 because the xen_remap_domain_mfn_range implementation is currently PV only. Changes in v2: - better commit message; - return -EINVAL from xen_remap_domain_mfn_range if auto_translated_physmap. Signed-off-by: Stefano Stabellini Acked-by: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 3 +++ drivers/xen/privcmd.c | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3a73785631ce..885a22354a96 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2310,6 +2310,9 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, unsigned long range; int err = 0; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -EINVAL; + prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index ccee0f16bcf8..85226cbeca24 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -380,10 +380,6 @@ static struct vm_operations_struct privcmd_vm_ops = { static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) { - /* Unsupported for auto-translate guests. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return -ENOSYS; - /* DONTCOPY is essential for Xen because copy_page_range doesn't know * how to recreate these mappings */ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP; From a8752fd9a4106c5efe324109df133692d5fcbffc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 13 Aug 2012 13:26:11 -0400 Subject: [PATCH 16/48] xen/swiotlb: Remove functions not needed anymore. Sparse warns us off: drivers/xen/swiotlb-xen.c:506:1: warning: symbol 'xen_swiotlb_map_sg' was not declared. Should it be static? drivers/xen/swiotlb-xen.c:534:1: warning: symbol 'xen_swiotlb_unmap_sg' was not declared. Should it be static? and it looks like we do not need this function at all. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 16 ---------------- include/xen/swiotlb-xen.h | 9 --------- 2 files changed, 25 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index a52f3ae05d94..29e8cd48180f 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -466,14 +466,6 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, } EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs); -int -xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); -} -EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg); - /* * Unmap a set of streaming mode DMA translations. Again, cpu read rules * concerning calls here are the same as for swiotlb_unmap_page() above. @@ -494,14 +486,6 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); -void -xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); -} -EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg); - /* * Make physical memory consistent for a set of streaming mode DMA translations * after a transfer. diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index 4f4d449f00f6..c050b6b9de38 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -23,15 +23,6 @@ extern dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, extern void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs); -/* -extern int -xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, - enum dma_data_direction dir); - -extern void -xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, - enum dma_data_direction dir); -*/ extern int xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir, From 6d7083eee3bc088d1fc30eefabd6263bca40c95a Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 13 Aug 2012 11:00:08 -0400 Subject: [PATCH 17/48] xen/swiotlb: Fix compile warnings when using plain integer instead of NULL pointer. arch/x86/xen/pci-swiotlb-xen.c:96:1: warning: Using plain integer as NULL pointer arch/x86/xen/pci-swiotlb-xen.c:96:1: warning: Using plain integer as NULL pointer Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/pci-swiotlb-xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 2d58b3ff4fae..1ab45941502d 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -63,6 +63,6 @@ void __init pci_xen_swiotlb_init(void) } } IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, - 0, + NULL, pci_xen_swiotlb_init, - 0); + NULL); From 51faaf2b0d5c7f44d82964f0c70b1c4e44d4e633 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 22 Aug 2012 13:00:10 -0400 Subject: [PATCH 18/48] Revert "xen/x86: Workaround 64-bit hypervisor and 32-bit initial domain." and "xen/x86: Use memblock_reserve for sensitive areas." This reverts commit 806c312e50f122c47913145cf884f53dd09d9199 and commit 59b294403e9814e7c1154043567f0d71bac7a511. And also documents setup.c and why we want to do it that way, which is that we tried to make the the memblock_reserve more selective so that it would be clear what region is reserved. Sadly we ran in the problem wherein on a 64-bit hypervisor with a 32-bit initial domain, the pt_base has the cr3 value which is not neccessarily where the pagetable starts! As Jan put it: " Actually, the adjustment turns out to be correct: The page tables for a 32-on-64 dom0 get allocated in the order "first L1", "first L2", "first L3", so the offset to the page table base is indeed 2. When reading xen/include/public/xen.h's comment very strictly, this is not a violation (since there nothing is said that the first thing in the page table space is pointed to by pt_base; I admit that this seems to be implied though, namely do I think that it is implied that the page table space is the range [pt_base, pt_base + nt_pt_frames), whereas that range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), which - without a priori knowledge - the kernel would have difficulty to figure out)." - so lets just fall back to the easy way and reserve the whole region. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 60 ---------------------------------------- arch/x86/xen/p2m.c | 5 ---- arch/x86/xen/setup.c | 27 ++++++++++++++++++ 3 files changed, 27 insertions(+), 65 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 511f92d79e4a..ff962d4b821e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -998,66 +998,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) return ret; } -/* - * If the MFN is not in the m2p (provided to us by the hypervisor) this - * function won't do anything. In practice this means that the XenBus - * MFN won't be available for the initial domain. */ -static unsigned long __init xen_reserve_mfn(unsigned long mfn) -{ - unsigned long pfn, end_pfn = 0; - if (!mfn) - return end_pfn; - - pfn = mfn_to_pfn(mfn); - if (phys_to_machine_mapping_valid(pfn)) { - end_pfn = PFN_PHYS(pfn) + PAGE_SIZE; - memblock_reserve(PFN_PHYS(pfn), end_pfn); - } - return end_pfn; -} -static void __init xen_reserve_internals(void) -{ - unsigned long size; - unsigned long last_phys = 0; - - if (!xen_pv_domain()) - return; - - /* xen_start_info does not exist in the M2P, hence can't use - * xen_reserve_mfn. */ - memblock_reserve(__pa(xen_start_info), PAGE_SIZE); - last_phys = __pa(xen_start_info) + PAGE_SIZE; - - last_phys = max(xen_reserve_mfn(PFN_DOWN(xen_start_info->shared_info)), last_phys); - last_phys = max(xen_reserve_mfn(xen_start_info->store_mfn), last_phys); - - if (!xen_initial_domain()) - last_phys = max(xen_reserve_mfn(xen_start_info->console.domU.mfn), last_phys); - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - - /* - * ALIGN up to compensate for the p2m_page pointing to an array that - * can partially filled (look in xen_build_dynamic_phys_to_machine). - */ - - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - - /* We could use xen_reserve_mfn here, but would end up looping quite - * a lot (and call memblock_reserve for each PAGE), so lets just use - * the easy way and reserve it wholesale. */ - memblock_reserve(__pa(xen_start_info->mfn_list), size); - last_phys = max(__pa(xen_start_info->mfn_list) + size, last_phys); - /* The pagetables are reserved in mmu.c */ - - /* Under 64-bit hypervisor with a 32-bit domain, the hypervisor - * offsets the pt_base by two pages. Hence the reservation that is done - * in mmu.c misses two pages. We correct it here if we detect this. */ - if (last_phys < __pa(xen_start_info->pt_base)) - memblock_reserve(last_phys, __pa(xen_start_info->pt_base) - last_phys); -} void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -1421,7 +1362,6 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); - xen_reserve_internals(); /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 6a2bfa43c8a1..e4adbfbdfada 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -388,11 +388,6 @@ void __init xen_build_dynamic_phys_to_machine(void) } m2p_override_init(); - - /* NOTE: We cannot call memblock_reserve here for the mfn_list as there - * isn't enough pieces to make it work (for one - we are still using the - * Xen provided pagetable). Do it later in xen_reserve_internals. - */ } unsigned long get_phys_to_machine(unsigned long pfn) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 9efca750405d..740517be4da5 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -424,6 +424,33 @@ char * __init xen_memory_setup(void) e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); + /* + * Reserve Xen bits: + * - mfn_list + * - xen_start_info + * See comment above "struct start_info" in + * We tried to make the the memblock_reserve more selective so + * that it would be clear what region is reserved. Sadly we ran + * in the problem wherein on a 64-bit hypervisor with a 32-bit + * initial domain, the pt_base has the cr3 value which is not + * neccessarily where the pagetable starts! As Jan put it: " + * Actually, the adjustment turns out to be correct: The page + * tables for a 32-on-64 dom0 get allocated in the order "first L1", + * "first L2", "first L3", so the offset to the page table base is + * indeed 2. When reading xen/include/public/xen.h's comment + * very strictly, this is not a violation (since there nothing is said + * that the first thing in the page table space is pointed to by + * pt_base; I admit that this seems to be implied though, namely + * do I think that it is implied that the page table space is the + * range [pt_base, pt_base + nt_pt_frames), whereas that + * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), + * which - without a priori knowledge - the kernel would have + * difficulty to figure out)." - so lets just fall back to the + * easy way and reserve the whole region. + */ + memblock_reserve(__pa(xen_start_info->mfn_list), + xen_start_info->pt_base - xen_start_info->mfn_list); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); return "Xen"; From 3699aad047e16a5775b1d051425f422a9384270d Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 28 Jun 2012 22:47:35 -0400 Subject: [PATCH 19/48] xen/mmu: The xen_setup_kernel_pagetable doesn't need to return anything. We don't need to return the new PGD - as we do not use it. Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 5 +---- arch/x86/xen/mmu.c | 10 ++-------- arch/x86/xen/xen-ops.h | 2 +- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ff962d4b821e..d87a038a0484 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1258,7 +1258,6 @@ asmlinkage void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; int rc; - pgd_t *pgd; if (!xen_start_info) return; @@ -1350,8 +1349,6 @@ asmlinkage void __init xen_start_kernel(void) acpi_numa = -1; #endif - pgd = (pgd_t *)xen_start_info->pt_base; - /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; @@ -1360,7 +1357,7 @@ asmlinkage void __init xen_start_kernel(void) early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); - pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3a73785631ce..4ac21a4c6da4 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1719,8 +1719,7 @@ static void convert_pfn_mfn(void *v) * of the physical mapping once some sort of allocator has been set * up. */ -pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pud_t *l3; pmd_t *l2; @@ -1781,8 +1780,6 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, memblock_reserve(__pa(xen_start_info->pt_base), xen_start_info->nr_pt_frames * PAGE_SIZE); - - return pgd; } #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); @@ -1825,8 +1822,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) pv_mmu_ops.write_cr3 = &xen_write_cr3; } -pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) +void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; @@ -1858,8 +1854,6 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, memblock_reserve(__pa(xen_start_info->pt_base), xen_start_info->nr_pt_frames * PAGE_SIZE); - - return initial_page_table; } #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 202d4c150154..2230f57a6ebe 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -27,7 +27,7 @@ void xen_setup_mfn_list_list(void); void xen_setup_shared_info(void); void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); -pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); +void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); extern unsigned long xen_max_p2m_pfn; From 4fac153a7a260e40e10008a0d7a272719684e4cd Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 Jul 2012 13:55:25 -0400 Subject: [PATCH 20/48] xen/mmu: Provide comments describing the _ka and _va aliasing issue Which is that the level2_kernel_pgt (__ka virtual addresses) and level2_ident_pgt (__va virtual address) contain the same PMD entries. So if you modify a PTE in __ka, it will be reflected in __va (and vice-versa). Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4ac21a4c6da4..6ba610098dd9 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1734,19 +1734,36 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) init_level4_pgt[0] = __pgd(0); /* Pre-constructed entries are in pfn, so convert to mfn */ + /* L4[272] -> level3_ident_pgt + * L4[511] -> level3_kernel_pgt */ convert_pfn_mfn(init_level4_pgt); + + /* L3_i[0] -> level2_ident_pgt */ convert_pfn_mfn(level3_ident_pgt); + /* L3_k[510] -> level2_kernel_pgt + * L3_i[511] -> level2_fixmap_pgt */ convert_pfn_mfn(level3_kernel_pgt); + /* We get [511][511] and have Xen's version of level2_kernel_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: + * Both L4[272][0] and L4[511][511] have entries that point to the same + * L2 (PMD) tables. Meaning that if you modify it in __va space + * it will be also modified in the __ka space! (But if you just + * modify the PMD table to point to other PTE's or none, then you + * are OK - which is what cleanup_highmap does) */ memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + /* Graft it onto L4[511][511] */ memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + /* Get [511][510] and graft that in level2_fixmap_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + /* Note that we don't do anything with level1_fixmap_pgt which + * we don't need. */ /* Set up identity map */ xen_map_identity_early(level2_ident_pgt, max_pfn); From ae895ed7839f918bbc8d5425b8973b25a534f4eb Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Jul 2012 11:57:04 -0400 Subject: [PATCH 21/48] xen/mmu: use copy_page instead of memcpy. After all, this is what it is there for. Acked-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 6ba610098dd9..7247e5a62f27 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1754,14 +1754,14 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) * it will be also modified in the __ka space! (But if you just * modify the PMD table to point to other PTE's or none, then you * are OK - which is what cleanup_highmap does) */ - memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(level2_ident_pgt, l2); /* Graft it onto L4[511][511] */ - memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(level2_kernel_pgt, l2); /* Get [511][510] and graft that in level2_fixmap_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); - memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(level2_fixmap_pgt, l2); /* Note that we don't do anything with level1_fixmap_pgt which * we don't need. */ @@ -1821,8 +1821,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) */ swapper_kernel_pmd = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - memcpy(swapper_kernel_pmd, initial_kernel_pmd, - sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(swapper_kernel_pmd, initial_kernel_pmd); swapper_pg_dir[KERNEL_PGD_BOUNDARY] = __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); @@ -1851,11 +1850,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 512*1024); kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); - memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); + copy_page(initial_kernel_pmd, kernel_pmd); xen_map_identity_early(initial_kernel_pmd, max_pfn); - memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + copy_page(initial_page_table, pgd); initial_page_table[KERNEL_PGD_BOUNDARY] = __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); From caaf9ecf16feffa4f1a5a0d617bc78906a114514 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 12 Jul 2012 13:59:36 -0400 Subject: [PATCH 22/48] xen/mmu: For 64-bit do not call xen_map_identity_early B/c we do not need it. During the startup the Xen provides us with all the initial memory mapped that we need to function. The initial memory mapped is up to the bootstack, which means we can reference using __ka up to 4.f): (from xen/interface/xen.h): 4. This the order of bootstrap elements in the initial virtual region: a. relocated kernel image b. initial ram disk [mod_start, mod_len] c. list of allocated page frames [mfn_list, nr_pages] d. start_info_t structure [register ESI (x86)] e. bootstrap page tables [pt_base, CR3 (x86)] f. bootstrap stack [register ESP (x86)] (initial ram disk may be ommitted). [v1: More comments in git commit] Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 7247e5a62f27..a59070b09055 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -84,6 +84,7 @@ */ DEFINE_SPINLOCK(xen_reservation_lock); +#ifdef CONFIG_X86_32 /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -91,7 +92,7 @@ DEFINE_SPINLOCK(xen_reservation_lock); */ #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); - +#endif #ifdef CONFIG_X86_64 /* l3 pud for userspace vsyscall mapping */ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; @@ -1628,7 +1629,7 @@ static void set_page_prot(void *addr, pgprot_t prot) if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) BUG(); } - +#ifdef CONFIG_X86_32 static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) { unsigned pmdidx, pteidx; @@ -1679,7 +1680,7 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) set_page_prot(pmd, PAGE_KERNEL_RO); } - +#endif void __init xen_setup_machphys_mapping(void) { struct xen_machphys_mapping mapping; @@ -1765,14 +1766,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Note that we don't do anything with level1_fixmap_pgt which * we don't need. */ - /* Set up identity map */ - xen_map_identity_early(level2_ident_pgt, max_pfn); - /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); From 488f046df922af992c1a718eff276529c0510885 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Jul 2012 12:00:56 -0400 Subject: [PATCH 23/48] xen/mmu: Recycle the Xen provided L4, L3, and L2 pages As we are not using them. We end up only using the L1 pagetables and grafting those to our page-tables. [v1: Per Stefano's suggestion squashed two commits] [v2: Per Stefano's suggestion simplified loop] [v3: Fix smatch warnings] [v4: Add more comments] Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a59070b09055..b44e6a88ea74 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1708,7 +1708,20 @@ static void convert_pfn_mfn(void *v) for (i = 0; i < PTRS_PER_PTE; i++) pte[i] = xen_make_pte(pte[i].pte); } - +static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, + unsigned long addr) +{ + if (*pt_base == PFN_DOWN(__pa(addr))) { + set_page_prot((void *)addr, PAGE_KERNEL); + clear_page((void *)addr); + (*pt_base)++; + } + if (*pt_end == PFN_DOWN(__pa(addr))) { + set_page_prot((void *)addr, PAGE_KERNEL); + clear_page((void *)addr); + (*pt_end)--; + } +} /* * Set up the initial kernel pagetable. * @@ -1724,6 +1737,9 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pud_t *l3; pmd_t *l2; + unsigned long addr[3]; + unsigned long pt_base, pt_end; + unsigned i; /* max_pfn_mapped is the last pfn mapped in the initial memory * mappings. Considering that on Xen after the kernel mappings we @@ -1731,6 +1747,9 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) * set max_pfn_mapped to the last real pfn mapped. */ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); + pt_end = pt_base + xen_start_info->nr_pt_frames; + /* Zap identity mapping */ init_level4_pgt[0] = __pgd(0); @@ -1749,6 +1768,9 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + addr[0] = (unsigned long)pgd; + addr[1] = (unsigned long)l3; + addr[2] = (unsigned long)l2; /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: * Both L4[272][0] and L4[511][511] have entries that point to the same * L2 (PMD) tables. Meaning that if you modify it in __va space @@ -1782,20 +1804,26 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Unpin Xen-provided one */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - /* Switch over */ - pgd = init_level4_pgt; - /* * At this stage there can be no user pgd, and no page * structure to attach it to, so make sure we just set kernel * pgd. */ xen_mc_batch(); - __xen_write_cr3(true, __pa(pgd)); + __xen_write_cr3(true, __pa(init_level4_pgt)); xen_mc_issue(PARAVIRT_LAZY_CPU); - memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE); + /* We can't that easily rip out L3 and L2, as the Xen pagetables are + * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for + * the initial domain. For guests using the toolstack, they are in: + * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only + * rip out the [L4] (pgd), but for guests we shave off three pages. + */ + for (i = 0; i < ARRAY_SIZE(addr); i++) + check_pt_base(&pt_base, &pt_end, addr[i]); + + /* Our (by three pages) smaller Xen pagetable that we are using */ + memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); } #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); From 357a3cfb147ee8e97c6f9cdc51e9a33aa56f7d99 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 19 Jul 2012 13:52:29 -0400 Subject: [PATCH 24/48] xen/p2m: Add logic to revector a P2M tree to use __va leafs. During bootup Xen supplies us with a P2M array. It sticks it right after the ramdisk, as can be seen with a 128GB PV guest: (certain parts removed for clarity): xc_dom_build_image: called xc_dom_alloc_segment: kernel : 0xffffffff81000000 -> 0xffffffff81e43000 (pfn 0x1000 + 0xe43 pages) xc_dom_pfn_to_ptr: domU mapping: pfn 0x1000+0xe43 at 0x7f097d8bf000 xc_dom_alloc_segment: ramdisk : 0xffffffff81e43000 -> 0xffffffff925c7000 (pfn 0x1e43 + 0x10784 pages) xc_dom_pfn_to_ptr: domU mapping: pfn 0x1e43+0x10784 at 0x7f0952dd2000 xc_dom_alloc_segment: phys2mach : 0xffffffff925c7000 -> 0xffffffffa25c7000 (pfn 0x125c7 + 0x10000 pages) xc_dom_pfn_to_ptr: domU mapping: pfn 0x125c7+0x10000 at 0x7f0942dd2000 xc_dom_alloc_page : start info : 0xffffffffa25c7000 (pfn 0x225c7) xc_dom_alloc_page : xenstore : 0xffffffffa25c8000 (pfn 0x225c8) xc_dom_alloc_page : console : 0xffffffffa25c9000 (pfn 0x225c9) nr_page_tables: 0x0000ffffffffffff/48: 0xffff000000000000 -> 0xffffffffffffffff, 1 table(s) nr_page_tables: 0x0000007fffffffff/39: 0xffffff8000000000 -> 0xffffffffffffffff, 1 table(s) nr_page_tables: 0x000000003fffffff/30: 0xffffffff80000000 -> 0xffffffffbfffffff, 1 table(s) nr_page_tables: 0x00000000001fffff/21: 0xffffffff80000000 -> 0xffffffffa27fffff, 276 table(s) xc_dom_alloc_segment: page tables : 0xffffffffa25ca000 -> 0xffffffffa26e1000 (pfn 0x225ca + 0x117 pages) xc_dom_pfn_to_ptr: domU mapping: pfn 0x225ca+0x117 at 0x7f097d7a8000 xc_dom_alloc_page : boot stack : 0xffffffffa26e1000 (pfn 0x226e1) xc_dom_build_image : virt_alloc_end : 0xffffffffa26e2000 xc_dom_build_image : virt_pgtab_end : 0xffffffffa2800000 So the physical memory and virtual (using __START_KERNEL_map addresses) layout looks as so: phys __ka /------------\ /-------------------\ | 0 | empty | 0xffffffff80000000| | .. | | .. | | 16MB | <= kernel starts | 0xffffffff81000000| | .. | | | | 30MB | <= kernel ends => | 0xffffffff81e43000| | .. | & ramdisk starts | .. | | 293MB | <= ramdisk ends=> | 0xffffffff925c7000| | .. | & P2M starts | .. | | .. | | .. | | 549MB | <= P2M ends => | 0xffffffffa25c7000| | .. | start_info | 0xffffffffa25c7000| | .. | xenstore | 0xffffffffa25c8000| | .. | cosole | 0xffffffffa25c9000| | 549MB | <= page tables => | 0xffffffffa25ca000| | .. | | | | 550MB | <= PGT end => | 0xffffffffa26e1000| | .. | boot stack | | \------------/ \-------------------/ As can be seen, the ramdisk, P2M and pagetables are taking a bit of __ka addresses space. Which is a problem since the MODULES_VADDR starts at 0xffffffffa0000000 - and P2M sits right in there! This results during bootup with the inability to load modules, with this error: ------------[ cut here ]------------ WARNING: at /home/konrad/ssd/linux/mm/vmalloc.c:106 vmap_page_range_noflush+0x2d9/0x370() Call Trace: [] warn_slowpath_common+0x7a/0xb0 [] ? __raw_callee_save_xen_pmd_val+0x11/0x1e [] warn_slowpath_null+0x15/0x20 [] vmap_page_range_noflush+0x2d9/0x370 [] map_vm_area+0x2d/0x50 [] __vmalloc_node_range+0x160/0x250 [] ? module_alloc_update_bounds+0x19/0x80 [] ? load_module+0x66/0x19c0 [] module_alloc+0x5c/0x60 [] ? module_alloc_update_bounds+0x19/0x80 [] module_alloc_update_bounds+0x19/0x80 [] load_module+0xfa3/0x19c0 [] ? security_file_permission+0x86/0x90 [] sys_init_module+0x5a/0x220 [] system_call_fastpath+0x16/0x1b ---[ end trace fd8f7704fdea0291 ]--- vmalloc: allocation failure, allocated 16384 of 20480 bytes modprobe: page allocation failure: order:0, mode:0xd2 Since the __va and __ka are 1:1 up to MODULES_VADDR and cleanup_highmap rids __ka of the ramdisk mapping, what we want to do is similar - get rid of the P2M in the __ka address space. There are two ways of fixing this: 1) All P2M lookups instead of using the __ka address would use the __va address. This means we can safely erase from __ka space the PMD pointers that point to the PFNs for P2M array and be OK. 2). Allocate a new array, copy the existing P2M into it, revector the P2M tree to use that, and return the old P2M to the memory allocate. This has the advantage that it sets the stage for using XEN_ELF_NOTE_INIT_P2M feature. That feature allows us to set the exact virtual address space we want for the P2M - and allows us to boot as initial domain on large machines. So we pick option 2). This patch only lays the groundwork in the P2M code. The patch that modifies the MMU is called "xen/mmu: Copy and revector the P2M tree." Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 70 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 1 + 2 files changed, 71 insertions(+) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index e4adbfbdfada..996ee2bf7bdb 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -389,7 +389,77 @@ void __init xen_build_dynamic_phys_to_machine(void) m2p_override_init(); } +#ifdef CONFIG_X86_64 +#include +unsigned long __init xen_revector_p2m_tree(void) +{ + unsigned long va_start; + unsigned long va_end; + unsigned long pfn; + unsigned long *mfn_list = NULL; + unsigned long size; + va_start = xen_start_info->mfn_list; + /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long), + * so make sure it is rounded up to that */ + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + va_end = va_start + size; + + /* If we were revectored already, don't do it again. */ + if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET) + return 0; + + mfn_list = alloc_bootmem_align(size, PAGE_SIZE); + if (!mfn_list) { + pr_warn("Could not allocate space for a new P2M tree!\n"); + return xen_start_info->mfn_list; + } + /* Fill it out with INVALID_P2M_ENTRY value */ + memset(mfn_list, 0xFF, size); + + for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx; + unsigned long *mid_p; + + if (!p2m_top[topidx]) + continue; + + if (p2m_top[topidx] == p2m_mid_missing) + continue; + + mididx = p2m_mid_index(pfn); + mid_p = p2m_top[topidx][mididx]; + if (!mid_p) + continue; + if ((mid_p == p2m_missing) || (mid_p == p2m_identity)) + continue; + + if ((unsigned long)mid_p == INVALID_P2M_ENTRY) + continue; + + /* The old va. Rebase it on mfn_list */ + if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) { + unsigned long *new; + + new = &mfn_list[pfn]; + + copy_page(new, mid_p); + p2m_top[topidx][mididx] = &mfn_list[pfn]; + p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn]); + + } + /* This should be the leafs allocated for identity from _brk. */ + } + return (unsigned long)mfn_list; + +} +#else +unsigned long __init xen_revector_p2m_tree(void) +{ + return 0; +} +#endif unsigned long get_phys_to_machine(unsigned long pfn) { unsigned topidx, mididx, idx; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 2230f57a6ebe..bb5a8105ea86 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -45,6 +45,7 @@ void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); +unsigned long __init xen_revector_p2m_tree(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); From 7f9140626c757b773624b97865cb53c2a8348a69 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Jul 2012 12:47:40 -0400 Subject: [PATCH 25/48] xen/mmu: Copy and revector the P2M tree. Please first read the description in "xen/p2m: Add logic to revector a P2M tree to use __va leafs" patch. The 'xen_revector_p2m_tree()' function allocates a new P2M tree copies the contents of the old one in it, and returns the new one. At this stage, the __ka address space (which is what the old P2M tree was using) is partially disassembled. The cleanup_highmap has removed the PMD entries from 0-16MB and anything past _brk_end up to the max_pfn_mapped (which is the end of the ramdisk). We have revectored the P2M tree (and the one for save/restore as well) to use new shiny __va address to new MFNs. The xen_start_info has been taken care of already in 'xen_setup_kernel_pagetable()' and xen_start_info->shared_info in 'xen_setup_shared_info()', so we are free to roam and delete PMD entries - which is exactly what we are going to do. We rip out the __ka for the old P2M array. [v1: Fix smatch warnings] [v2: memset was doing 0 instead of 0xff] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b44e6a88ea74..a640949f78d4 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1183,9 +1183,64 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) static void xen_post_allocator_init(void); +#ifdef CONFIG_X86_64 +static void __init xen_cleanhighmap(unsigned long vaddr, + unsigned long vaddr_end) +{ + unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); + + /* NOTE: The loop is more greedy than the cleanup_highmap variant. + * We include the PMD passed in on _both_ boundaries. */ + for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE)); + pmd++, vaddr += PMD_SIZE) { + if (pmd_none(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > kernel_end) + set_pmd(pmd, __pmd(0)); + } + /* In case we did something silly, we should crash in this function + * instead of somewhere later and be confusing. */ + xen_mc_flush(); +} +#endif static void __init xen_pagetable_setup_done(pgd_t *base) { +#ifdef CONFIG_X86_64 + unsigned long size; + unsigned long addr; +#endif + xen_setup_shared_info(); +#ifdef CONFIG_X86_64 + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long new_mfn_list; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + + /* On 32-bit, we get zero so this never gets executed. */ + new_mfn_list = xen_revector_p2m_tree(); + if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) { + /* using __ka address and sticking INVALID_P2M_ENTRY! */ + memset((void *)xen_start_info->mfn_list, 0xff, size); + + /* We should be in __ka space. */ + BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); + addr = xen_start_info->mfn_list; + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + /* We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or xen_start_info->shared_info + * they are in going to crash. Fortunatly we have already revectored + * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + size = roundup(size, PMD_SIZE); + xen_cleanhighmap(addr, addr + size); + + memblock_free(__pa(xen_start_info->mfn_list), size); + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = new_mfn_list; + } + } +#endif xen_post_allocator_init(); } @@ -1824,6 +1879,8 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Our (by three pages) smaller Xen pagetable that we are using */ memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); + /* Revector the xen_start_info */ + xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); } #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); From 3aca7fbc8ede0dd194317b2e3144815128ffb232 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 14 Aug 2012 14:34:00 -0400 Subject: [PATCH 26/48] xen/mmu: Remove from __ka space PMD entries for pagetables. Please first read the description in "xen/mmu: Copy and revector the P2M tree." At this stage, the __ka address space (which is what the old P2M tree was using) is partially disassembled. The cleanup_highmap has removed the PMD entries from 0-16MB and anything past _brk_end up to the max_pfn_mapped (which is the end of the ramdisk). The xen_remove_p2m_tree and code around has ripped out the __ka for the old P2M array. Here we continue on doing it to where the Xen page-tables were. It is safe to do it, as the page-tables are addressed using __va. For good measure we delete anything that is within MODULES_VADDR and up to the end of the PMD. At this point the __ka only contains PMD entries for the start of the kernel up to __brk. [v1: Per Stefano's suggestion wrapped the MODULES_VADDR in debug] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a640949f78d4..3f8e963b76c0 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1240,6 +1240,25 @@ static void __init xen_pagetable_setup_done(pgd_t *base) xen_start_info->mfn_list = new_mfn_list; } } + /* At this stage, cleanup_highmap has already cleaned __ka space + * from _brk_limit way up to the max_pfn_mapped (which is the end of + * the ramdisk). We continue on, erasing PMD entries that point to page + * tables - do note that they are accessible at this stage via __va. + * For good measure we also round up to the PMD - which means that if + * anybody is using __ka address to the initial boot-stack - and try + * to use it - they are going to crash. The xen_start_info has been + * taken care of already in xen_setup_kernel_pagetable. */ + addr = xen_start_info->pt_base; + size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); + + xen_cleanhighmap(addr, addr + size); + xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); +#ifdef DEBUG + /* This is superflous and is not neccessary, but you know what + * lets do it. The MODULES_VADDR -> MODULES_END should be clear of + * anything at this stage. */ + xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); +#endif #endif xen_post_allocator_init(); } From 785f62314984ea3af9dd830b020289ba2509ae69 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 14 Aug 2012 16:37:31 -0400 Subject: [PATCH 27/48] xen/mmu: Release just the MFN list, not MFN list and part of pagetables. We call memblock_reserve for [start of mfn list] -> [PMD aligned end of mfn list] instead of -> --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3f8e963b76c0..5b2cb54425ce 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1227,7 +1227,6 @@ static void __init xen_pagetable_setup_done(pgd_t *base) /* We should be in __ka space. */ BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); addr = xen_start_info->mfn_list; - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); /* We roundup to the PMD, which means that if anybody at this stage is * using the __ka address of xen_start_info or xen_start_info->shared_info * they are in going to crash. Fortunatly we have already revectored @@ -1235,6 +1234,7 @@ static void __init xen_pagetable_setup_done(pgd_t *base) size = roundup(size, PMD_SIZE); xen_cleanhighmap(addr, addr + size); + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); memblock_free(__pa(xen_start_info->mfn_list), size); /* And revector! Bye bye old array */ xen_start_info->mfn_list = new_mfn_list; From 3fc509fc0c590900568ef516a37101d88f3476f5 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 16 Aug 2012 16:38:55 -0400 Subject: [PATCH 28/48] xen/p2m: When revectoring deal with holes in the P2M array. When we free the PFNs and then subsequently populate them back during bootup: Freeing 20000-20200 pfn range: 512 pages freed 1-1 mapping on 20000->20200 Freeing 40000-40200 pfn range: 512 pages freed 1-1 mapping on 40000->40200 Freeing bad80-badf4 pfn range: 116 pages freed 1-1 mapping on bad80->badf4 Freeing badf6-bae7f pfn range: 137 pages freed 1-1 mapping on badf6->bae7f Freeing bb000-100000 pfn range: 282624 pages freed 1-1 mapping on bb000->100000 Released 283999 pages of unused memory Set 283999 page(s) to 1-1 mapping Populating 1acb8a-1f20e9 pfn range: 283999 pages added We end up having the P2M array (that is the one that was grafted on the P2M tree) filled with IDENTITY_FRAME or INVALID_P2M_ENTRY) entries. The patch titled "xen/p2m: Reuse existing P2M leafs if they are filled with 1:1 PFNs or INVALID." recycles said slots and replaces the P2M tree leaf's with &mfn_list[xx] with p2m_identity or p2m_missing. And re-uses the P2M array sections for other P2M tree leaf's. For the above mentioned bootup excerpt, the PFNs at 0x20000->0x20200 are going to be IDENTITY based: P2M[0][256][0] -> P2M[0][257][0] get turned in IDENTITY_FRAME. We can re-use that and replace P2M[0][256] to point to p2m_identity. The "old" page (the grafted P2M array provided by Xen) that was at P2M[0][256] gets put somewhere else. Specifically at P2M[6][358], b/c when we populate back: Populating 1acb8a-1f20e9 pfn range: 283999 pages added we fill P2M[6][358][0] (and P2M[6][358], P2M[6][359], ...) with the new MFNs. That is all OK, except when we revector we assume that the PFN count would be the same in the grafted P2M array and in the newly allocated. Since that is no longer the case, as we have holes in the P2M that point to p2m_missing or p2m_identity we have to take that into account. [v2: Check for overflow] [v3: Move within the __va check] [v4: Fix the computation] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 996ee2bf7bdb..c3e92912c3fb 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -396,6 +396,7 @@ unsigned long __init xen_revector_p2m_tree(void) unsigned long va_start; unsigned long va_end; unsigned long pfn; + unsigned long pfn_free = 0; unsigned long *mfn_list = NULL; unsigned long size; @@ -442,11 +443,18 @@ unsigned long __init xen_revector_p2m_tree(void) if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) { unsigned long *new; - new = &mfn_list[pfn]; + if (pfn_free > (size / sizeof(unsigned long))) { + WARN(1, "Only allocated for %ld pages, but we want %ld!\n", + size / sizeof(unsigned long), pfn_free); + return 0; + } + new = &mfn_list[pfn_free]; copy_page(new, mid_p); - p2m_top[topidx][mididx] = &mfn_list[pfn]; - p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn]); + p2m_top[topidx][mididx] = &mfn_list[pfn_free]; + p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]); + + pfn_free += P2M_PER_PAGE; } /* This should be the leafs allocated for identity from _brk. */ From 328731876451a837f56e66ffa11de053ed5daf73 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 17 Aug 2012 09:35:31 -0400 Subject: [PATCH 29/48] xen/mmu: If the revector fails, don't attempt to revector anything else. If the P2M revectoring would fail, we would try to continue on by cleaning the PMD for L1 (PTE) page-tables. The xen_cleanhighmap is greedy and erases the PMD on both boundaries. Since the P2M array can share the PMD, we would wipe out part of the __ka that is still used in the P2M tree to point to P2M leafs. This fixes it by bypassing the revectoring and continuing on. If the revector fails, a nice WARN is printed so we can still troubleshoot this. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5b2cb54425ce..cb9db72b33f8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1238,7 +1238,8 @@ static void __init xen_pagetable_setup_done(pgd_t *base) memblock_free(__pa(xen_start_info->mfn_list), size); /* And revector! Bye bye old array */ xen_start_info->mfn_list = new_mfn_list; - } + } else + goto skip; } /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of @@ -1259,6 +1260,7 @@ static void __init xen_pagetable_setup_done(pgd_t *base) * anything at this stage. */ xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); #endif +skip: #endif xen_post_allocator_init(); } From 69870a847856a1ba81f655a8633fce5f5b614730 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 30 Aug 2012 13:58:11 +0100 Subject: [PATCH 30/48] xen/mm: return more precise error from xen_remap_domain_range() Callers of xen_remap_domain_range() need to know if the remap failed because frame is currently paged out. So they can retry the remap later on. Return -ENOENT in this case. This assumes that the error codes returned by Xen are a subset of those used by the kernel. It is unclear if this is defined as part of the hypercall ABI. Acked-by: Andres Lagar-Cavilla Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 885a22354a96..2d9e7c9c0e7b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2331,8 +2331,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, if (err) goto out; - err = -EFAULT; - if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) + err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); + if (err < 0) goto out; nr -= batch; From ceb90fa0a8008059ecbbf9114cb89dc71a730bb6 Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Fri, 31 Aug 2012 09:59:30 -0400 Subject: [PATCH 31/48] xen/privcmd: add PRIVCMD_MMAPBATCH_V2 ioctl PRIVCMD_MMAPBATCH_V2 extends PRIVCMD_MMAPBATCH with an additional field for reporting the error code for every frame that could not be mapped. libxc prefers PRIVCMD_MMAPBATCH_V2 over PRIVCMD_MMAPBATCH. Also expand PRIVCMD_MMAPBATCH to return appropriate error-encoding top nibble in the mfn array. Signed-off-by: David Vrabel Signed-off-by: Andres Lagar-Cavilla Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/privcmd.c | 126 ++++++++++++++++++++++++++++++++---------- include/xen/privcmd.h | 24 +++++++- 2 files changed, 121 insertions(+), 29 deletions(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 85226cbeca24..92a285b1b96a 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -76,7 +76,7 @@ static void free_page_list(struct list_head *pages) */ static int gather_array(struct list_head *pagelist, unsigned nelem, size_t size, - void __user *data) + const void __user *data) { unsigned pageidx; void *pagedata; @@ -246,61 +246,117 @@ struct mmap_batch_state { domid_t domain; unsigned long va; struct vm_area_struct *vma; - int err; + /* A tristate: + * 0 for no errors + * 1 if at least one error has happened (and no + * -ENOENT errors have happened) + * -ENOENT if at least 1 -ENOENT has happened. + */ + int global_error; + /* An array for individual errors */ + int *err; - xen_pfn_t __user *user; + /* User-space mfn array to store errors in the second pass for V1. */ + xen_pfn_t __user *user_mfn; }; static int mmap_batch_fn(void *data, void *state) { xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; + int ret; - if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, - st->vma->vm_page_prot, st->domain) < 0) { - *mfnp |= 0xf0000000U; - st->err++; + ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, + st->vma->vm_page_prot, st->domain); + + /* Store error code for second pass. */ + *(st->err++) = ret; + + /* And see if it affects the global_error. */ + if (ret < 0) { + if (ret == -ENOENT) + st->global_error = -ENOENT; + else { + /* Record that at least one error has happened. */ + if (st->global_error == 0) + st->global_error = 1; + } } st->va += PAGE_SIZE; return 0; } -static int mmap_return_errors(void *data, void *state) +static int mmap_return_errors_v1(void *data, void *state) { xen_pfn_t *mfnp = data; struct mmap_batch_state *st = state; + int err = *(st->err++); - return put_user(*mfnp, st->user++); + /* + * V1 encodes the error codes in the 32bit top nibble of the + * mfn (with its known limitations vis-a-vis 64 bit callers). + */ + *mfnp |= (err == -ENOENT) ? + PRIVCMD_MMAPBATCH_PAGED_ERROR : + PRIVCMD_MMAPBATCH_MFN_ERROR; + return __put_user(*mfnp, st->user_mfn++); } static struct vm_operations_struct privcmd_vm_ops; -static long privcmd_ioctl_mmap_batch(void __user *udata) +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) { int ret; - struct privcmd_mmapbatch m; + struct privcmd_mmapbatch_v2 m; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long nr_pages; LIST_HEAD(pagelist); + int *err_array = NULL; struct mmap_batch_state state; if (!xen_initial_domain()) return -EPERM; - if (copy_from_user(&m, udata, sizeof(m))) - return -EFAULT; + switch (version) { + case 1: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) + return -EFAULT; + /* Returns per-frame error in m.arr. */ + m.err = NULL; + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) + return -EFAULT; + break; + case 2: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) + return -EFAULT; + /* Returns per-frame error code in m.err. */ + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) + return -EFAULT; + break; + default: + return -EINVAL; + } nr_pages = m.num; if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) return -EINVAL; - ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), - m.arr); + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); - if (ret || list_empty(&pagelist)) + if (ret) goto out; + if (list_empty(&pagelist)) { + ret = -EINVAL; + goto out; + } + + err_array = kcalloc(m.num, sizeof(int), GFP_KERNEL); + if (err_array == NULL) { + ret = -ENOMEM; + goto out; + } down_write(&mm->mmap_sem); @@ -315,24 +371,34 @@ static long privcmd_ioctl_mmap_batch(void __user *udata) goto out; } - state.domain = m.dom; - state.vma = vma; - state.va = m.addr; - state.err = 0; + state.domain = m.dom; + state.vma = vma; + state.va = m.addr; + state.global_error = 0; + state.err = err_array; - ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, mmap_batch_fn, &state); + /* mmap_batch_fn guarantees ret == 0 */ + BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_batch_fn, &state)); up_write(&mm->mmap_sem); - if (state.err > 0) { - state.user = m.arr; + if (state.global_error && (version == 1)) { + /* Write back errors in second pass. */ + state.user_mfn = (xen_pfn_t *)m.arr; + state.err = err_array; ret = traverse_pages(m.num, sizeof(xen_pfn_t), - &pagelist, - mmap_return_errors, &state); - } + &pagelist, mmap_return_errors_v1, &state); + } else + ret = __copy_to_user(m.err, err_array, m.num * sizeof(int)); + + /* If we have not had any EFAULT-like global errors then set the global + * error to -ENOENT if necessary. */ + if ((ret == 0) && (state.global_error == -ENOENT)) + ret = -ENOENT; out: + kfree(err_array); free_page_list(&pagelist); return ret; @@ -354,7 +420,11 @@ static long privcmd_ioctl(struct file *file, break; case IOCTL_PRIVCMD_MMAPBATCH: - ret = privcmd_ioctl_mmap_batch(udata); + ret = privcmd_ioctl_mmap_batch(udata, 1); + break; + + case IOCTL_PRIVCMD_MMAPBATCH_V2: + ret = privcmd_ioctl_mmap_batch(udata, 2); break; default: diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h index 45c1aa14b83d..a85316811d79 100644 --- a/include/xen/privcmd.h +++ b/include/xen/privcmd.h @@ -58,13 +58,33 @@ struct privcmd_mmapbatch { int num; /* number of pages to populate */ domid_t dom; /* target domain */ __u64 addr; /* virtual address */ - xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ + xen_pfn_t __user *arr; /* array of mfns - or'd with + PRIVCMD_MMAPBATCH_*_ERROR on err */ +}; + +#define PRIVCMD_MMAPBATCH_MFN_ERROR 0xf0000000U +#define PRIVCMD_MMAPBATCH_PAGED_ERROR 0x80000000U + +struct privcmd_mmapbatch_v2 { + unsigned int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + __u64 addr; /* virtual address */ + const xen_pfn_t __user *arr; /* array of mfns */ + int __user *err; /* array of error codes */ }; /* * @cmd: IOCTL_PRIVCMD_HYPERCALL * @arg: &privcmd_hypercall_t * Return: Value returned from execution of the specified hypercall. + * + * @cmd: IOCTL_PRIVCMD_MMAPBATCH_V2 + * @arg: &struct privcmd_mmapbatch_v2 + * Return: 0 on success (i.e., arg->err contains valid error codes for + * each frame). On an error other than a failed frame remap, -1 is + * returned and errno is set to EINVAL, EFAULT etc. As an exception, + * if the operation was otherwise successful but any frame failed with + * -ENOENT, then -1 is returned and errno is set to ENOENT. */ #define IOCTL_PRIVCMD_HYPERCALL \ _IOC(_IOC_NONE, 'P', 0, sizeof(struct privcmd_hypercall)) @@ -72,5 +92,7 @@ struct privcmd_mmapbatch { _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap)) #define IOCTL_PRIVCMD_MMAPBATCH \ _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch)) +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ + _IOC(_IOC_NONE, 'P', 4, sizeof(struct privcmd_mmapbatch_v2)) #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ From 1714df7f2cee6a741c3ed24231ec5db25b90633a Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Thu, 6 Sep 2012 13:24:39 -0400 Subject: [PATCH 32/48] xen/privcmd: Fix mmap batch ioctl error status copy back. Copy back of per-slot error codes is only necessary for V2. V1 does not provide an error array, so copyback will unconditionally set the global rc to EFAULT. Only copyback for V2. Signed-off-by: Andres Lagar-Cavilla Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/privcmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 92a285b1b96a..3b162c656205 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -389,7 +389,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) state.err = err_array; ret = traverse_pages(m.num, sizeof(xen_pfn_t), &pagelist, mmap_return_errors_v1, &state); - } else + } else if (version == 2) ret = __copy_to_user(m.err, err_array, m.num * sizeof(int)); /* If we have not had any EFAULT-like global errors then set the global From 9d2be9287107695708e6aae5105a8a518a6cb4d0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 8 Sep 2012 12:57:35 +0300 Subject: [PATCH 33/48] xen/privcmd: return -EFAULT on error __copy_to_user() returns the number of bytes remaining to be copied but we want to return a negative error code here. Acked-by: Andres Lagar-Cavilla Signed-off-by: Dan Carpenter Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/privcmd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 3b162c656205..ef6389580b8c 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -389,8 +389,11 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) state.err = err_array; ret = traverse_pages(m.num, sizeof(xen_pfn_t), &pagelist, mmap_return_errors_v1, &state); - } else if (version == 2) + } else if (version == 2) { ret = __copy_to_user(m.err, err_array, m.num * sizeof(int)); + if (ret) + ret = -EFAULT; + } /* If we have not had any EFAULT-like global errors then set the global * error to -ENOENT if necessary. */ From e58f5b55113b8fd4eb8eb43f5508d87e4862f280 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 14 Sep 2012 08:19:01 +0100 Subject: [PATCH 34/48] xen: resynchronise grant table status codes with upstream Adds GNTST_address_too_big and GNTST_eagain. Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/interface/grant_table.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h index 7da811bdd558..f9f8b975ae74 100644 --- a/include/xen/interface/grant_table.h +++ b/include/xen/interface/grant_table.h @@ -519,7 +519,9 @@ DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version); #define GNTST_no_device_space (-7) /* Out of space in I/O MMU. */ #define GNTST_permission_denied (-8) /* Not enough privilege for operation. */ #define GNTST_bad_page (-9) /* Specified page was invalid for op. */ -#define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary */ +#define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary. */ +#define GNTST_address_too_big (-11) /* transfer page address too large. */ +#define GNTST_eagain (-12) /* Operation not done; try again. */ #define GNTTABOP_error_msgs { \ "okay", \ @@ -532,7 +534,9 @@ DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version); "no spare translation slot in the I/O MMU", \ "permission denied", \ "bad page", \ - "copy arguments cross page boundary" \ + "copy arguments cross page boundary", \ + "page address size too large", \ + "operation not done; try again" \ } #endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */ From ecc635f90adfe1b7cd5fd354f49edfbf24aa4e3e Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 14 Sep 2012 12:13:12 +0100 Subject: [PATCH 35/48] xen/arm: compile and run xenbus bind_evtchn_to_irqhandler can legitimately return 0 (irq 0): it is not an error. If Linux is running as an HVM domain and is running as Dom0, use xenstored_local_init to initialize the xenstore page and event channel. Changes in v4: - do not xs_reset_watches on dom0. Changes in v2: - refactor xenbus_init. Signed-off-by: Stefano Stabellini [v5: Fixed case switch indentations] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xenbus/xenbus_comms.c | 2 +- drivers/xen/xenbus/xenbus_probe.c | 52 ++++++++++++++++++++++--------- drivers/xen/xenbus/xenbus_xs.c | 3 +- 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index 52fe7ad07666..c5aa55c5d371 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -224,7 +224,7 @@ int xb_init_comms(void) int err; err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting, 0, "xenbus", &xb_waitq); - if (err <= 0) { + if (err < 0) { printk(KERN_ERR "XENBUS request irq failed %i\n", err); return err; } diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 91d3d6544a7b..038b71dbf03c 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -719,17 +719,47 @@ static int __init xenstored_local_init(void) return err; } +enum xenstore_init { + UNKNOWN, + PV, + HVM, + LOCAL, +}; static int __init xenbus_init(void) { int err = 0; + enum xenstore_init usage = UNKNOWN; + uint64_t v = 0; if (!xen_domain()) return -ENODEV; xenbus_ring_ops_init(); - if (xen_hvm_domain()) { - uint64_t v = 0; + if (xen_pv_domain()) + usage = PV; + if (xen_hvm_domain()) + usage = HVM; + if (xen_hvm_domain() && xen_initial_domain()) + usage = LOCAL; + if (xen_pv_domain() && !xen_start_info->store_evtchn) + usage = LOCAL; + if (xen_pv_domain() && xen_start_info->store_evtchn) + xenstored_ready = 1; + + switch (usage) { + case LOCAL: + err = xenstored_local_init(); + if (err) + goto out_error; + xen_store_interface = mfn_to_virt(xen_store_mfn); + break; + case PV: + xen_store_evtchn = xen_start_info->store_evtchn; + xen_store_mfn = xen_start_info->store_mfn; + xen_store_interface = mfn_to_virt(xen_store_mfn); + break; + case HVM: err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); if (err) goto out_error; @@ -738,18 +768,12 @@ static int __init xenbus_init(void) if (err) goto out_error; xen_store_mfn = (unsigned long)v; - xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); - } else { - xen_store_evtchn = xen_start_info->store_evtchn; - xen_store_mfn = xen_start_info->store_mfn; - if (xen_store_evtchn) - xenstored_ready = 1; - else { - err = xenstored_local_init(); - if (err) - goto out_error; - } - xen_store_interface = mfn_to_virt(xen_store_mfn); + xen_store_interface = + ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); + break; + default: + pr_warn("Xenstore state unknown\n"); + break; } /* Initialize the interface to xenstore. */ diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index bce15cf4a8df..131dec04794e 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include "xenbus_comms.h" @@ -622,7 +623,7 @@ static void xs_reset_watches(void) { int err, supported = 0; - if (!xen_hvm_domain()) + if (!xen_hvm_domain() || xen_initial_domain()) return; err = xenbus_scanf(XBT_NIL, "control", From 1cef36a529f44dbb3612ee0deeb0b5563de36163 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 23 Aug 2012 13:55:26 -0400 Subject: [PATCH 36/48] xen/swiotlb: Move the nr_tbl determination in its own function. Moving the function out of the way to prepare for the late SWIOTLB init. Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 1afb4fba11b4..a2aad6ec2401 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -144,25 +144,26 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) } while (i < nslabs); return 0; } +static unsigned long xen_set_nslabs(unsigned long nr_tbl) +{ + if (!nr_tbl) { + xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); + xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); + } else + xen_io_tlb_nslabs = nr_tbl; + return xen_io_tlb_nslabs << IO_TLB_SHIFT; +} void __init xen_swiotlb_init(int verbose) { unsigned long bytes; int rc = -ENOMEM; - unsigned long nr_tbl; char *m = NULL; unsigned int repeat = 3; - nr_tbl = swiotlb_nr_tbl(); - if (nr_tbl) - xen_io_tlb_nslabs = nr_tbl; - else { - xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); - xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); - } + xen_io_tlb_nslabs = swiotlb_nr_tbl(); retry: - bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; - + bytes = xen_set_nslabs(xen_io_tlb_nslabs); /* * Get IO TLB memory from any location. */ From 5bab7864b1167f9a72d375f6854027db436a1cc1 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 23 Aug 2012 14:03:55 -0400 Subject: [PATCH 37/48] xen/swiotlb: Move the error strings to its own function. That way we can more easily reuse those errors when using the late SWIOTLB init. Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index a2aad6ec2401..701b1035fa6f 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -154,11 +154,33 @@ static unsigned long xen_set_nslabs(unsigned long nr_tbl) return xen_io_tlb_nslabs << IO_TLB_SHIFT; } + +enum xen_swiotlb_err { + XEN_SWIOTLB_UNKNOWN = 0, + XEN_SWIOTLB_ENOMEM, + XEN_SWIOTLB_EFIXUP +}; + +static const char *xen_swiotlb_error(enum xen_swiotlb_err err) +{ + switch (err) { + case XEN_SWIOTLB_ENOMEM: + return "Cannot allocate Xen-SWIOTLB buffer\n"; + case XEN_SWIOTLB_EFIXUP: + return "Failed to get contiguous memory for DMA from Xen!\n"\ + "You either: don't have the permissions, do not have"\ + " enough free memory under 4GB, or the hypervisor memory"\ + " is too fragmented!"; + default: + break; + } + return ""; +} void __init xen_swiotlb_init(int verbose) { unsigned long bytes; int rc = -ENOMEM; - char *m = NULL; + enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN; unsigned int repeat = 3; xen_io_tlb_nslabs = swiotlb_nr_tbl(); @@ -169,7 +191,7 @@ void __init xen_swiotlb_init(int verbose) */ xen_io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes)); if (!xen_io_tlb_start) { - m = "Cannot allocate Xen-SWIOTLB buffer!\n"; + m_ret = XEN_SWIOTLB_ENOMEM; goto error; } xen_io_tlb_end = xen_io_tlb_start + bytes; @@ -181,10 +203,7 @@ void __init xen_swiotlb_init(int verbose) xen_io_tlb_nslabs); if (rc) { free_bootmem(__pa(xen_io_tlb_start), PAGE_ALIGN(bytes)); - m = "Failed to get contiguous memory for DMA from Xen!\n"\ - "You either: don't have the permissions, do not have"\ - " enough free memory under 4GB, or the hypervisor memory"\ - "is too fragmented!"; + m_ret = XEN_SWIOTLB_EFIXUP; goto error; } start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); @@ -199,8 +218,8 @@ void __init xen_swiotlb_init(int verbose) (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); goto retry; } - xen_raw_printk("%s (rc:%d)", m, rc); - panic("%s (rc:%d)", m, rc); + xen_raw_printk("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); + panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); } void * From b82776005369899c1c7ca2e4b2414bb64b538d2c Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 23 Aug 2012 14:36:15 -0400 Subject: [PATCH 38/48] xen/swiotlb: Use the swiotlb_late_init_with_tbl to init Xen-SWIOTLB late when PV PCI is used. With this patch we provide the functionality to initialize the Xen-SWIOTLB late in the bootup cycle - specifically for Xen PCI-frontend. We still will work if the user had supplied 'iommu=soft' on the Linux command line. Note: We cannot depend on after_bootmem to automatically determine whether this is early or not. This is because when PCI IOMMUs are initialized it is after after_bootmem but before a lot of "other" subsystems are initialized. CC: FUJITA Tomonori [v1: Fix smatch warnings] [v2: Added check for xen_swiotlb] [v3: Rebased with new xen-swiotlb changes] [v4: squashed xen/swiotlb: Depending on after_bootmem is not correct in] Reviewed-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/swiotlb-xen.h | 2 ++ arch/x86/xen/pci-swiotlb-xen.c | 24 +++++++++++-- drivers/xen/swiotlb-xen.c | 48 ++++++++++++++++++++------ include/xen/swiotlb-xen.h | 2 +- 4 files changed, 63 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index 1be1ab7d6a41..ee52fcac6f72 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -5,10 +5,12 @@ extern int xen_swiotlb; extern int __init pci_xen_swiotlb_detect(void); extern void __init pci_xen_swiotlb_init(void); +extern int pci_xen_swiotlb_init_late(void); #else #define xen_swiotlb (0) static inline int __init pci_xen_swiotlb_detect(void) { return 0; } static inline void __init pci_xen_swiotlb_init(void) { } +static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } #endif #endif /* _ASM_X86_SWIOTLB_XEN_H */ diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 1c1722761eec..b152640d8388 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -12,7 +12,7 @@ #include #include #endif - +#include int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { @@ -69,13 +69,33 @@ int __init pci_xen_swiotlb_detect(void) void __init pci_xen_swiotlb_init(void) { if (xen_swiotlb) { - xen_swiotlb_init(1); + xen_swiotlb_init(1, true /* early */); dma_ops = &xen_swiotlb_dma_ops; /* Make sure ACS will be enabled */ pci_request_acs(); } } + +int pci_xen_swiotlb_init_late(void) +{ + int rc; + + if (xen_swiotlb) + return 0; + + rc = xen_swiotlb_init(1, false /* late */); + if (rc) + return rc; + + dma_ops = &xen_swiotlb_dma_ops; + /* Make sure ACS will be enabled */ + pci_request_acs(); + + return 0; +} +EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); + IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, 0, pci_xen_swiotlb_init, diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 701b1035fa6f..7461edb5118e 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -176,9 +176,9 @@ static const char *xen_swiotlb_error(enum xen_swiotlb_err err) } return ""; } -void __init xen_swiotlb_init(int verbose) +int __ref xen_swiotlb_init(int verbose, bool early) { - unsigned long bytes; + unsigned long bytes, order; int rc = -ENOMEM; enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN; unsigned int repeat = 3; @@ -186,10 +186,28 @@ void __init xen_swiotlb_init(int verbose) xen_io_tlb_nslabs = swiotlb_nr_tbl(); retry: bytes = xen_set_nslabs(xen_io_tlb_nslabs); + order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT); /* * Get IO TLB memory from any location. */ - xen_io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes)); + if (early) + xen_io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes)); + else { +#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) +#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) + while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { + xen_io_tlb_start = (void *)__get_free_pages(__GFP_NOWARN, order); + if (xen_io_tlb_start) + break; + order--; + } + if (order != get_order(bytes)) { + pr_warn("Warning: only able to allocate %ld MB " + "for software IO TLB\n", (PAGE_SIZE << order) >> 20); + xen_io_tlb_nslabs = SLABS_PER_PAGE << order; + bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; + } + } if (!xen_io_tlb_start) { m_ret = XEN_SWIOTLB_ENOMEM; goto error; @@ -202,14 +220,21 @@ void __init xen_swiotlb_init(int verbose) bytes, xen_io_tlb_nslabs); if (rc) { - free_bootmem(__pa(xen_io_tlb_start), PAGE_ALIGN(bytes)); + if (early) + free_bootmem(__pa(xen_io_tlb_start), PAGE_ALIGN(bytes)); + else { + free_pages((unsigned long)xen_io_tlb_start, order); + xen_io_tlb_start = NULL; + } m_ret = XEN_SWIOTLB_EFIXUP; goto error; } start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); - swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose); - - return; + if (early) + swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose); + else + rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); + return rc; error: if (repeat--) { xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */ @@ -218,10 +243,13 @@ void __init xen_swiotlb_init(int verbose) (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); goto retry; } - xen_raw_printk("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); - panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); + pr_err("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); + if (early) + panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); + else + free_pages((unsigned long)xen_io_tlb_start, order); + return rc; } - void * xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags, diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index 4f4d449f00f6..289ee509bda9 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -3,7 +3,7 @@ #include -extern void xen_swiotlb_init(int verbose); +extern int xen_swiotlb_init(int verbose, bool early); extern void *xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, From c468bdee28a1cb61d9b7a8ce9859d17dee43b7d7 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 17 Sep 2012 10:20:09 -0400 Subject: [PATCH 39/48] xen/swiotlb: For early initialization, return zero on success. If everything is setup properly we would return -ENOMEM since rc by default is set to that value. Lets not do that and return a proper return code. Note: The reason the early code needs this special treatment is that it SWIOTLB library call does not return anything (and had it failed it would call panic()) - but our function does. Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 7461edb5118e..b70f2d19ab99 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -230,9 +230,10 @@ int __ref xen_swiotlb_init(int verbose, bool early) goto error; } start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); - if (early) + if (early) { swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose); - else + rc = 0; + } else rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); return rc; error: From 3d925320e9e2de162bd138bf97816bda8c3f71be Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 31 Jul 2012 06:26:59 -0400 Subject: [PATCH 40/48] xen/pcifront: Use Xen-SWIOTLB when initting if required. We piggyback on "xen/swiotlb: Use the swiotlb_late_init_with_tbl to init Xen-SWIOTLB late when PV PCI is used." functionality to start up the Xen-SWIOTLB if we are hot-plugged. This allows us to bypass the need to supply 'iommu=soft' on the Linux command line (mostly). With this patch, if a user forgot 'iommu=soft' on the command line, and hotplug a PCI device they will get: pcifront pci-0: Installing PCI frontend Warning: only able to allocate 4 MB for software IO TLB software IO TLB [mem 0x2a000000-0x2a3fffff] (4MB) mapped at [ffff88002a000000-ffff88002a3fffff] pcifront pci-0: Creating PCI Frontend Bus 0000:00 pcifront pci-0: PCI host bridge to bus 0000:00 pci_bus 0000:00: root bus resource [io 0x0000-0xffff] pci_bus 0000:00: root bus resource [mem 0x00000000-0xfffffffff] pci 0000:00:00.0: [8086:10d3] type 00 class 0x020000 pci 0000:00:00.0: reg 10: [mem 0xfe5c0000-0xfe5dffff] pci 0000:00:00.0: reg 14: [mem 0xfe500000-0xfe57ffff] pci 0000:00:00.0: reg 18: [io 0xe000-0xe01f] pci 0000:00:00.0: reg 1c: [mem 0xfe5e0000-0xfe5e3fff] pcifront pci-0: claiming resource 0000:00:00.0/0 pcifront pci-0: claiming resource 0000:00:00.0/1 pcifront pci-0: claiming resource 0000:00:00.0/2 pcifront pci-0: claiming resource 0000:00:00.0/3 e1000e: Intel(R) PRO/1000 Network Driver - 2.0.0-k e1000e: Copyright(c) 1999 - 2012 Intel Corporation. e1000e 0000:00:00.0: Disabling ASPM L0s L1 e1000e 0000:00:00.0: enabling device (0000 -> 0002) e1000e 0000:00:00.0: Xen PCI mapped GSI16 to IRQ34 e1000e 0000:00:00.0: (unregistered net_device): Interrupt Throttling Rate (ints/sec) set to dynamic conservative mode e1000e 0000:00:00.0: eth0: (PCI Express:2.5GT/s:Width x1) 00:1b:21:ab:c6:13 e1000e 0000:00:00.0: eth0: Intel(R) PRO/1000 Network Connection e1000e 0000:00:00.0: eth0: MAC: 3, PHY: 8, PBA No: E46981-005 The "Warning only" will go away if one supplies 'iommu=soft' instead as we have a higher chance of being able to allocate large swaths of memory. Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/pci/xen-pcifront.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index d6cc62cb4cf7..0ad8ca326389 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -21,6 +21,7 @@ #include #include +#include #define INVALID_GRANT_REF (0) #define INVALID_EVTCHN (-1) @@ -668,7 +669,7 @@ static irqreturn_t pcifront_handler_aer(int irq, void *dev) schedule_pcifront_aer_op(pdev); return IRQ_HANDLED; } -static int pcifront_connect(struct pcifront_device *pdev) +static int pcifront_connect_and_init_dma(struct pcifront_device *pdev) { int err = 0; @@ -681,9 +682,13 @@ static int pcifront_connect(struct pcifront_device *pdev) dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n"); err = -EEXIST; } - spin_unlock(&pcifront_dev_lock); + if (!err && !swiotlb_nr_tbl()) { + err = pci_xen_swiotlb_init_late(); + if (err) + dev_err(&pdev->xdev->dev, "Could not setup SWIOTLB!\n"); + } return err; } @@ -842,10 +847,10 @@ static int __devinit pcifront_try_connect(struct pcifront_device *pdev) XenbusStateInitialised) goto out; - err = pcifront_connect(pdev); + err = pcifront_connect_and_init_dma(pdev); if (err) { xenbus_dev_fatal(pdev->xdev, err, - "Error connecting PCI Frontend"); + "Error setting up PCI Frontend"); goto out; } From e815f45e6e5fe40ca3049987eab80d89d5781f03 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 13 Aug 2012 13:26:11 -0400 Subject: [PATCH 41/48] xen/swiotlb: Remove functions not needed anymore. Sparse warns us off: drivers/xen/swiotlb-xen.c:506:1: warning: symbol 'xen_swiotlb_map_sg' was not declared. Should it be static? drivers/xen/swiotlb-xen.c:534:1: warning: symbol 'xen_swiotlb_unmap_sg' was not declared. Should it be static? and it looks like we do not need this function at all. Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 16 ---------------- include/xen/swiotlb-xen.h | 9 --------- 2 files changed, 25 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index b70f2d19ab99..ab4c66c19c1a 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -515,14 +515,6 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, } EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs); -int -xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); -} -EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg); - /* * Unmap a set of streaming mode DMA translations. Again, cpu read rules * concerning calls here are the same as for swiotlb_unmap_page() above. @@ -543,14 +535,6 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); -void -xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); -} -EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg); - /* * Make physical memory consistent for a set of streaming mode DMA translations * after a transfer. diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index 289ee509bda9..de8bcc641c49 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -23,15 +23,6 @@ extern dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, extern void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs); -/* -extern int -xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, - enum dma_data_direction dir); - -extern void -xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, - enum dma_data_direction dir); -*/ extern int xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir, From 2a3bce8f6afb9118a7ac3c360a5baf7cdaec87bc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 13 Aug 2012 11:00:08 -0400 Subject: [PATCH 42/48] xen/swiotlb: Fix compile warnings when using plain integer instead of NULL pointer. arch/x86/xen/pci-swiotlb-xen.c:96:1: warning: Using plain integer as NULL pointer arch/x86/xen/pci-swiotlb-xen.c:96:1: warning: Using plain integer as NULL pointer Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/pci-swiotlb-xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index b152640d8388..1608244756de 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -97,6 +97,6 @@ int pci_xen_swiotlb_init_late(void) EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, - 0, + NULL, pci_xen_swiotlb_init, - 0); + NULL); From c3cb4709809e655a4ba5a716086c8bc5bbbbccdb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 18 Sep 2012 12:29:03 +0100 Subject: [PATCH 43/48] xen-pciback: support wild cards in slot specifications Particularly for hiding sets of SR-IOV devices, specifying them all individually is rather cumbersome. Therefore, allow function and slot numbers to be replaced by a wildcard character ('*'). Unfortunately this gets complicated by the in-kernel sscanf() implementation not being really standard conformant - matching of plain text tails cannot be checked by the caller (a patch to overcome this will be sent shortly, and a follow-up patch for simplifying the code is planned to be sent when that fixed went upstream). Signed-off-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xen-pciback/pci_stub.c | 89 +++++++++++++++++++++++++++--- 1 file changed, 82 insertions(+), 7 deletions(-) diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 03342728bf23..20e1c42c1c48 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -897,17 +897,41 @@ static inline int str_to_slot(const char *buf, int *domain, int *bus, int *slot, int *func) { int err; + char wc = '*'; err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func); - if (err == 4) + switch (err) { + case 3: + *func = -1; + err = sscanf(buf, " %x:%x:%x.%c", domain, bus, slot, &wc); + break; + case 2: + *slot = *func = -1; + err = sscanf(buf, " %x:%x:*.%c", domain, bus, &wc); + if (err >= 2) + ++err; + break; + } + if (err == 4 && wc == '*') return 0; else if (err < 0) return -EINVAL; /* try again without domain */ *domain = 0; + wc = '*'; err = sscanf(buf, " %x:%x.%x", bus, slot, func); - if (err == 3) + switch (err) { + case 2: + *func = -1; + err = sscanf(buf, " %x:%x.%c", bus, slot, &wc); + break; + case 1: + *slot = *func = -1; + err = sscanf(buf, " %x:*.%c", bus, &wc) + 1; + break; + } + if (err == 3 && wc == '*') return 0; return -EINVAL; @@ -930,6 +954,19 @@ static int pcistub_device_id_add(int domain, int bus, int slot, int func) { struct pcistub_device_id *pci_dev_id; unsigned long flags; + int rc = 0; + + if (slot < 0) { + for (slot = 0; !rc && slot < 32; ++slot) + rc = pcistub_device_id_add(domain, bus, slot, func); + return rc; + } + + if (func < 0) { + for (func = 0; !rc && func < 8; ++func) + rc = pcistub_device_id_add(domain, bus, slot, func); + return rc; + } pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL); if (!pci_dev_id) @@ -952,15 +989,15 @@ static int pcistub_device_id_add(int domain, int bus, int slot, int func) static int pcistub_device_id_remove(int domain, int bus, int slot, int func) { struct pcistub_device_id *pci_dev_id, *t; - int devfn = PCI_DEVFN(slot, func); int err = -ENOENT; unsigned long flags; spin_lock_irqsave(&device_ids_lock, flags); list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, slot_list) { - if (pci_dev_id->domain == domain - && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) { + if (pci_dev_id->domain == domain && pci_dev_id->bus == bus + && (slot < 0 || PCI_SLOT(pci_dev_id->devfn) == slot) + && (func < 0 || PCI_FUNC(pci_dev_id->devfn) == func)) { /* Don't break; here because it's possible the same * slot could be in the list more than once */ @@ -1216,6 +1253,10 @@ static ssize_t permissive_add(struct device_driver *drv, const char *buf, err = str_to_slot(buf, &domain, &bus, &slot, &func); if (err) goto out; + if (slot < 0 || func < 0) { + err = -EINVAL; + goto out; + } psdev = pcistub_device_find(domain, bus, slot, func); if (!psdev) { err = -ENODEV; @@ -1297,17 +1338,51 @@ static int __init pcistub_init(void) if (pci_devs_to_hide && *pci_devs_to_hide) { do { + char wc = '*'; + parsed = 0; err = sscanf(pci_devs_to_hide + pos, " (%x:%x:%x.%x) %n", &domain, &bus, &slot, &func, &parsed); - if (err != 4) { + switch (err) { + case 3: + func = -1; + err = sscanf(pci_devs_to_hide + pos, + " (%x:%x:%x.%c) %n", + &domain, &bus, &slot, &wc, + &parsed); + break; + case 2: + slot = func = -1; + err = sscanf(pci_devs_to_hide + pos, + " (%x:%x:*.%c) %n", + &domain, &bus, &wc, &parsed) + 1; + break; + } + + if (err != 4 || wc != '*') { domain = 0; + wc = '*'; err = sscanf(pci_devs_to_hide + pos, " (%x:%x.%x) %n", &bus, &slot, &func, &parsed); - if (err != 3) + switch (err) { + case 2: + func = -1; + err = sscanf(pci_devs_to_hide + pos, + " (%x:%x.%c) %n", + &bus, &slot, &wc, + &parsed); + break; + case 1: + slot = func = -1; + err = sscanf(pci_devs_to_hide + pos, + " (%x:*.%c) %n", + &bus, &wc, &parsed) + 1; + break; + } + if (err != 3 || wc != '*') goto parse_error; } From c571898ffc24a1768e1b2dabeac0fc7dd4c14601 Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Fri, 14 Sep 2012 14:26:59 +0000 Subject: [PATCH 44/48] xen/gndev: Xen backend support for paged out grant targets V4. Since Xen-4.2, hvm domains may have portions of their memory paged out. When a foreign domain (such as dom0) attempts to map these frames, the map will initially fail. The hypervisor returns a suitable errno, and kicks an asynchronous page-in operation carried out by a helper. The foreign domain is expected to retry the mapping operation until it eventually succeeds. The foreign domain is not put to sleep because itself could be the one running the pager assist (typical scenario for dom0). This patch adds support for this mechanism for backend drivers using grant mapping and copying operations. Specifically, this covers the blkback and gntdev drivers (which map foreign grants), and the netback driver (which copies foreign grants). * Add a retry method for grants that fail with GNTST_eagain (i.e. because the target foreign frame is paged out). * Insert hooks with appropriate wrappers in the aforementioned drivers. The retry loop is only invoked if the grant operation status is GNTST_eagain. It guarantees to leave a new status code different from GNTST_eagain. Any other status code results in identical code execution as before. The retry loop performs 256 attempts with increasing time intervals through a 32 second period. It uses msleep to yield while waiting for the next retry. V2 after feedback from David Vrabel: * Explicit MAX_DELAY instead of wrap-around delay into zero * Abstract GNTST_eagain check into core grant table code for netback module. V3 after feedback from Ian Campbell: * Add placeholder in array of grant table error descriptions for unrelated error code we jump over. * Eliminate single map and retry macro in favor of a generic batch flavor. * Some renaming. * Bury most implementation in grant_table.c, cleaner interface. V4 rebased on top of sync of Xen grant table interface headers. Signed-off-by: Andres Lagar-Cavilla Acked-by: Ian Campbell [v5: Fixed whitespace issues] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/net/xen-netback/netback.c | 11 ++----- drivers/xen/grant-table.c | 53 ++++++++++++++++++++++++++++++ drivers/xen/xenbus/xenbus_client.c | 6 ++-- include/xen/grant_table.h | 12 +++++++ 4 files changed, 70 insertions(+), 12 deletions(-) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 682633bfe00f..05593d882023 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -635,9 +635,7 @@ static void xen_netbk_rx_action(struct xen_netbk *netbk) return; BUG_ON(npo.copy_prod > ARRAY_SIZE(netbk->grant_copy_op)); - ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, &netbk->grant_copy_op, - npo.copy_prod); - BUG_ON(ret != 0); + gnttab_batch_copy(netbk->grant_copy_op, npo.copy_prod); while ((skb = __skb_dequeue(&rxq)) != NULL) { sco = (struct skb_cb_overlay *)skb->cb; @@ -1460,18 +1458,15 @@ static void xen_netbk_tx_submit(struct xen_netbk *netbk) static void xen_netbk_tx_action(struct xen_netbk *netbk) { unsigned nr_gops; - int ret; nr_gops = xen_netbk_tx_build_gops(netbk); if (nr_gops == 0) return; - ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, - netbk->tx_copy_ops, nr_gops); - BUG_ON(ret); + + gnttab_batch_copy(netbk->tx_copy_ops, nr_gops); xen_netbk_tx_submit(netbk); - } static void xen_netbk_idx_release(struct xen_netbk *netbk, u16 pending_idx) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index eea81cf8a2a5..3a567b15600b 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -823,6 +824,52 @@ unsigned int gnttab_max_grant_frames(void) } EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +/* Handling of paged out grant targets (GNTST_eagain) */ +#define MAX_DELAY 256 +static inline void +gnttab_retry_eagain_gop(unsigned int cmd, void *gop, int16_t *status, + const char *func) +{ + unsigned delay = 1; + + do { + BUG_ON(HYPERVISOR_grant_table_op(cmd, gop, 1)); + if (*status == GNTST_eagain) + msleep(delay++); + } while ((*status == GNTST_eagain) && (delay < MAX_DELAY)); + + if (delay >= MAX_DELAY) { + printk(KERN_ERR "%s: %s eagain grant\n", func, current->comm); + *status = GNTST_bad_page; + } +} + +void gnttab_batch_map(struct gnttab_map_grant_ref *batch, unsigned count) +{ + struct gnttab_map_grant_ref *op; + + if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, batch, count)) + BUG(); + for (op = batch; op < batch + count; op++) + if (op->status == GNTST_eagain) + gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, op, + &op->status, __func__); +} +EXPORT_SYMBOL_GPL(gnttab_batch_map); + +void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count) +{ + struct gnttab_copy *op; + + if (HYPERVISOR_grant_table_op(GNTTABOP_copy, batch, count)) + BUG(); + for (op = batch; op < batch + count; op++) + if (op->status == GNTST_eagain) + gnttab_retry_eagain_gop(GNTTABOP_copy, op, + &op->status, __func__); +} +EXPORT_SYMBOL_GPL(gnttab_batch_copy); + int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) @@ -836,6 +883,12 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, if (ret) return ret; + /* Retry eagain maps */ + for (i = 0; i < count; i++) + if (map_ops[i].status == GNTST_eagain) + gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, + &map_ops[i].status, __func__); + if (xen_feature(XENFEAT_auto_translated_physmap)) return ret; diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index b3e146edb51d..bcf3ba4a6ec1 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -490,8 +490,7 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, op.host_addr = arbitrary_virt_to_machine(pte).maddr; - if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) - BUG(); + gnttab_batch_map(&op, 1); if (op.status != GNTST_okay) { free_vm_area(area); @@ -572,8 +571,7 @@ int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref, gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map, gnt_ref, dev->otherend_id); - if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) - BUG(); + gnttab_batch_map(&op, 1); if (op.status != GNTST_okay) { xenbus_dev_fatal(dev, op.status, diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 11e27c3af3cb..ba0d77529a29 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -189,4 +189,16 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, struct page **pages, unsigned int count, bool clear_pte); +/* Perform a batch of grant map/copy operations. Retry every batch slot + * for which the hypervisor returns GNTST_eagain. This is typically due + * to paged out target frames. + * + * Will retry for 1, 2, ... 255 ms, i.e. 256 times during 32 seconds. + * + * Return value in each iand every status field of the batch guaranteed + * to not be GNTST_eagain. + */ +void gnttab_batch_map(struct gnttab_map_grant_ref *batch, unsigned count); +void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count); + #endif /* __ASM_GNTTAB_H__ */ From ffb8b233c2261b7978dc3bd759aaa19bd1a7fadf Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 21 Sep 2012 12:30:35 -0400 Subject: [PATCH 45/48] xen/x86: retrieve keyboard shift status flags from hypervisor. The xen c/s 25873 allows the hypervisor to retrieve the NUMLOCK flag. With this patch, the Linux kernel can get the state according to the data in the BIOS. Acked-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 8 ++++++++ include/xen/interface/platform.h | 3 +++ 2 files changed, 11 insertions(+) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 47b3acdc2ac5..67897152237c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1440,11 +1440,19 @@ asmlinkage void __init xen_start_kernel(void) const struct dom0_vga_console_info *info = (void *)((char *)xen_start_info + xen_start_info->console.dom0.info_off); + struct xen_platform_op op = { + .cmd = XENPF_firmware_info, + .interface_version = XENPF_INTERFACE_VERSION, + .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, + }; xen_init_vga(info, xen_start_info->console.dom0.info_size); xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; + if (HYPERVISOR_dom0_op(&op) == 0) + boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; + xen_init_apic(); /* Make sure ACS will be enabled */ diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h index a3275a850e54..54ad6f9e4725 100644 --- a/include/xen/interface/platform.h +++ b/include/xen/interface/platform.h @@ -112,6 +112,7 @@ DEFINE_GUEST_HANDLE_STRUCT(xenpf_platform_quirk_t); #define XEN_FW_DISK_INFO 1 /* from int 13 AH=08/41/48 */ #define XEN_FW_DISK_MBR_SIGNATURE 2 /* from MBR offset 0x1b8 */ #define XEN_FW_VBEDDC_INFO 3 /* from int 10 AX=4f15 */ +#define XEN_FW_KBD_SHIFT_FLAGS 5 /* Int16, Fn02: Get keyboard shift flags. */ struct xenpf_firmware_info { /* IN variables. */ uint32_t type; @@ -142,6 +143,8 @@ struct xenpf_firmware_info { /* must refer to 128-byte buffer */ GUEST_HANDLE(uchar) edid; } vbeddc_info; /* XEN_FW_VBEDDC_INFO */ + + uint8_t kbd_shift_flags; /* XEN_FW_KBD_SHIFT_FLAGS */ } u; }; DEFINE_GUEST_HANDLE_STRUCT(xenpf_firmware_info_t); From aa387d630cfed1a694a9c8c61fba3877ba8d4f07 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 9 Feb 2012 11:33:51 +0800 Subject: [PATCH 46/48] xen/vga: add the xen EFI video mode support In order to add xen EFI frambebuffer video support, it is required to add xen-efi's new video type (XEN_VGATYPE_EFI_LFB) case and handle it in the function xen_init_vga and set the video type to VIDEO_TYPE_EFI to enable efi video mode. The original patch from which this was broken out from: http://marc.info/?i=4E099AA6020000780004A4C6@nat28.tlf.novell.com Signed-off-by: Jan Beulich Signed-off-by: Tang Liang [v2: The original author is Jan Beulich and Liang Tang ported it to upstream] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/vga.c | 7 +++++++ include/xen/interface/xen.h | 1 + 2 files changed, 8 insertions(+) diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c index 1cd7f4d11e29..6722e3733f02 100644 --- a/arch/x86/xen/vga.c +++ b/arch/x86/xen/vga.c @@ -35,6 +35,7 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) info->u.text_mode_3.font_height; break; + case XEN_VGATYPE_EFI_LFB: case XEN_VGATYPE_VESA_LFB: if (size < offsetof(struct dom0_vga_console_info, u.vesa_lfb.gbl_caps)) @@ -54,6 +55,12 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) screen_info->blue_pos = info->u.vesa_lfb.blue_pos; screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size; screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos; + + if (info->video_type == XEN_VGATYPE_EFI_LFB) { + screen_info->orig_video_isVGA = VIDEO_TYPE_EFI; + break; + } + if (size >= offsetof(struct dom0_vga_console_info, u.vesa_lfb.gbl_caps) + sizeof(info->u.vesa_lfb.gbl_caps)) diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 1e0df6b7d3b3..886a5d80a18f 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -454,6 +454,7 @@ struct dom0_vga_console_info { uint8_t video_type; #define XEN_VGATYPE_TEXT_MODE_3 0x03 #define XEN_VGATYPE_VESA_LFB 0x23 +#define XEN_VGATYPE_EFI_LFB 0x70 union { struct { From e6aa70a0d5511296ea3d5fd1f5e2203ff6898107 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 24 Sep 2012 15:55:37 +0100 Subject: [PATCH 47/48] xen-pciback: properly clean up after calling pcistub_device_find() As the function calls pcistub_device_get() before returning non-NULL, its callers need to take care of calling pcistub_device_put() on (mostly, but not exclusively) error paths. Otoh, the function already guarantees that the 'dev' member is non-NULL upon successful return, so callers do not need to check for this a second time. Signed-off-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xen-pciback/pci_stub.c | 46 +++++++++++++++--------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 20e1c42c1c48..acec6faff885 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -681,14 +681,14 @@ static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev) dev_err(&dev->dev, DRV_NAME " device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); - goto release; + goto end; } if (!test_bit(_XEN_PCIB_AERHANDLER, (unsigned long *)&psdev->pdev->sh_info->flags)) { dev_err(&dev->dev, "guest with no AER driver should have been killed\n"); - goto release; + goto end; } result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result); @@ -698,9 +698,9 @@ static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev) "No AER slot_reset service or disconnected!\n"); kill_domain_by_device(psdev); } -release: - pcistub_device_put(psdev); end: + if (psdev) + pcistub_device_put(psdev); up_write(&pcistub_sem); return result; @@ -739,14 +739,14 @@ static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev) dev_err(&dev->dev, DRV_NAME " device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); - goto release; + goto end; } if (!test_bit(_XEN_PCIB_AERHANDLER, (unsigned long *)&psdev->pdev->sh_info->flags)) { dev_err(&dev->dev, "guest with no AER driver should have been killed\n"); - goto release; + goto end; } result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result); @@ -756,9 +756,9 @@ static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev) "No AER mmio_enabled service or disconnected!\n"); kill_domain_by_device(psdev); } -release: - pcistub_device_put(psdev); end: + if (psdev) + pcistub_device_put(psdev); up_write(&pcistub_sem); return result; } @@ -797,7 +797,7 @@ static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev, dev_err(&dev->dev, DRV_NAME " device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); - goto release; + goto end; } /*Guest owns the device yet no aer handler regiested, kill guest*/ @@ -805,7 +805,7 @@ static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev, (unsigned long *)&psdev->pdev->sh_info->flags)) { dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n"); kill_domain_by_device(psdev); - goto release; + goto end; } result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result); @@ -815,9 +815,9 @@ static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev, "No AER error_detected service or disconnected!\n"); kill_domain_by_device(psdev); } -release: - pcistub_device_put(psdev); end: + if (psdev) + pcistub_device_put(psdev); up_write(&pcistub_sem); return result; } @@ -851,7 +851,7 @@ static void xen_pcibk_error_resume(struct pci_dev *dev) dev_err(&dev->dev, DRV_NAME " device is not connected or owned" " by HVM, kill it\n"); kill_domain_by_device(psdev); - goto release; + goto end; } if (!test_bit(_XEN_PCIB_AERHANDLER, @@ -859,13 +859,13 @@ static void xen_pcibk_error_resume(struct pci_dev *dev) dev_err(&dev->dev, "guest with no AER driver should have been killed\n"); kill_domain_by_device(psdev); - goto release; + goto end; } common_process(psdev, 1, XEN_PCI_OP_aer_resume, PCI_ERS_RESULT_RECOVERED); -release: - pcistub_device_put(psdev); end: + if (psdev) + pcistub_device_put(psdev); up_write(&pcistub_sem); return; } @@ -1024,7 +1024,7 @@ static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg, struct config_field *field; psdev = pcistub_device_find(domain, bus, slot, func); - if (!psdev || !psdev->dev) { + if (!psdev) { err = -ENODEV; goto out; } @@ -1048,6 +1048,8 @@ static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg, if (err) kfree(field); out: + if (psdev) + pcistub_device_put(psdev); return err; } @@ -1152,10 +1154,9 @@ static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, err = str_to_slot(buf, &domain, &bus, &slot, &func); if (err) - goto out; + return err; psdev = pcistub_device_find(domain, bus, slot, func); - if (!psdev) goto out; @@ -1171,6 +1172,8 @@ static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, if (dev_data->isr_on) dev_data->ack_intr = 1; out: + if (psdev) + pcistub_device_put(psdev); if (!err) err = count; return err; @@ -1262,10 +1265,7 @@ static ssize_t permissive_add(struct device_driver *drv, const char *buf, err = -ENODEV; goto out; } - if (!psdev->dev) { - err = -ENODEV; - goto release; - } + dev_data = pci_get_drvdata(psdev->dev); /* the driver data for a device should never be null at this point */ if (!dev_data) { From c341ca45ce56143804ef5a8f4db753e554e640b4 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 25 Sep 2012 16:48:24 -0400 Subject: [PATCH 48/48] xen/pciback: Restore the PCI config space after an FLR. When we do an FLR, or D0->D3_hot we may lose the BARs as the device has turned itself off (and on). This means the device cannot function unless the pci_restore_state is called - which it is when the PCI device is unbound from the Xen PCI backend driver. For PV guests it ends up calling pci_enable_device / pci_enable_msi[x] which does the proper steps That however is not happening if a HVM guest is run as QEMU deals with PCI configuration space. QEMU also requires that the device be "parked" under the ownership of a pci-stub driver to guarantee that the PCI device is not being used. Hence we follow the same incantation as pci_reset_function does - by doing an FLR, then restoring the PCI configuration space. The result of this patch is that when you run lspci, you get now this: - Region 0: [virtual] Memory at fe8c0000 (32-bit, non-prefetchable) [size=128K] - Region 1: [virtual] Memory at fe800000 (32-bit, non-prefetchable) [size=512K] + Region 0: Memory at fe8c0000 (32-bit, non-prefetchable) [size=128K] + Region 1: Memory at fe800000 (32-bit, non-prefetchable) [size=512K] Region 2: I/O ports at c000 [size=32] - Region 3: [virtual] Memory at fe8e0000 (32-bit, non-prefetchable) [size=16K] + Region 3: Memory at fe8e0000 (32-bit, non-prefetchable) [size=16K] The [virtual] means that lspci read those entries from SysFS but when it read them from the device it got a different value (0xfffffff). CC: stable@vger.kernel.org #only for 3.5, 3.6 Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xen-pciback/pci_stub.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index acec6faff885..e5a0c13e2ad4 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -362,6 +362,7 @@ static int __devinit pcistub_init_device(struct pci_dev *dev) else { dev_dbg(&dev->dev, "reseting (FLR, D3, etc) the device\n"); __pci_reset_function_locked(dev); + pci_restore_state(dev); } /* Now disable the device (this also ensures some private device * data is setup before we export)