From d6e65d54f0fa980f891aa3166d85b6ffd7d541eb Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 2 Oct 2012 13:21:54 -0600 Subject: [PATCH 01/25] pci: Helper function for testing if an INTx route changed Signed-off-by: Alex Williamson Signed-off-by: Michael S. Tsirkin --- hw/pci.c | 5 +++++ hw/pci.h | 1 + 2 files changed, 6 insertions(+) diff --git a/hw/pci.c b/hw/pci.c index d44fd0e10a..8c6b3d19ae 100644 --- a/hw/pci.c +++ b/hw/pci.c @@ -1121,6 +1121,11 @@ PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) return bus->route_intx_to_irq(bus->irq_opaque, pin); } +bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new) +{ + return old->mode != new->mode || old->irq != new->irq; +} + void pci_bus_fire_intx_routing_notifier(PCIBus *bus) { PCIDevice *dev; diff --git a/hw/pci.h b/hw/pci.h index 1f902f5b59..a852941a5c 100644 --- a/hw/pci.h +++ b/hw/pci.h @@ -326,6 +326,7 @@ PCIBus *pci_register_bus(DeviceState *parent, const char *name, uint8_t devfn_min, int nirq); void pci_bus_set_route_irq_fn(PCIBus *, pci_route_irq_fn); PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin); +bool pci_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new); void pci_bus_fire_intx_routing_notifier(PCIBus *bus); void pci_device_set_intx_routing_notifier(PCIDevice *dev, PCIINTxRoutingNotifier notifier); From 4774d7b258e0c4a6595a7b0bc6960c1751365bbf Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 2 Oct 2012 13:22:01 -0600 Subject: [PATCH 02/25] pci-assign: Use pci_intx_route_changed() Replace open coded version Signed-off-by: Alex Williamson Signed-off-by: Michael S. Tsirkin --- hw/kvm/pci-assign.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c index bfffbab1b3..ab01205666 100644 --- a/hw/kvm/pci-assign.c +++ b/hw/kvm/pci-assign.c @@ -882,8 +882,7 @@ static int assign_intx(AssignedDevice *dev) intx_route = pci_device_route_intx_to_irq(&dev->dev, dev->intpin); assert(intx_route.mode != PCI_INTX_INVERTED); - if (dev->intx_route.mode == intx_route.mode && - dev->intx_route.irq == intx_route.irq) { + if (!pci_intx_route_changed(&dev->intx_route, &intx_route)) { return 0; } From 39b9bc626a436837f0068cb600aaf91c0e8aa937 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 2 Oct 2012 13:22:07 -0600 Subject: [PATCH 03/25] msi: Add msi_get_message() vfio-pci and pci-assign both do this on their own for setting up direct MSI injection through KVM. Provide a helper function for this in MSI code. Signed-off-by: Alex Williamson Signed-off-by: Michael S. Tsirkin --- hw/msi.c | 45 +++++++++++++++++++++++++++++---------------- hw/msi.h | 1 + 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/hw/msi.c b/hw/msi.c index e2273a09ae..33037a80e9 100644 --- a/hw/msi.c +++ b/hw/msi.c @@ -122,6 +122,31 @@ void msi_set_message(PCIDevice *dev, MSIMessage msg) pci_set_word(dev->config + msi_data_off(dev, msi64bit), msg.data); } +MSIMessage msi_get_message(PCIDevice *dev, unsigned int vector) +{ + uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev)); + bool msi64bit = flags & PCI_MSI_FLAGS_64BIT; + unsigned int nr_vectors = msi_nr_vectors(flags); + MSIMessage msg; + + assert(vector < nr_vectors); + + if (msi64bit) { + msg.address = pci_get_quad(dev->config + msi_address_lo_off(dev)); + } else { + msg.address = pci_get_long(dev->config + msi_address_lo_off(dev)); + } + + /* upper bit 31:16 is zero */ + msg.data = pci_get_word(dev->config + msi_data_off(dev, msi64bit)); + if (nr_vectors > 1) { + msg.data &= ~(nr_vectors - 1); + msg.data |= vector; + } + + return msg; +} + bool msi_enabled(const PCIDevice *dev) { return msi_present(dev) && @@ -249,8 +274,7 @@ void msi_notify(PCIDevice *dev, unsigned int vector) uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev)); bool msi64bit = flags & PCI_MSI_FLAGS_64BIT; unsigned int nr_vectors = msi_nr_vectors(flags); - uint64_t address; - uint32_t data; + MSIMessage msg; assert(vector < nr_vectors); if (msi_is_masked(dev, vector)) { @@ -261,24 +285,13 @@ void msi_notify(PCIDevice *dev, unsigned int vector) return; } - if (msi64bit) { - address = pci_get_quad(dev->config + msi_address_lo_off(dev)); - } else { - address = pci_get_long(dev->config + msi_address_lo_off(dev)); - } - - /* upper bit 31:16 is zero */ - data = pci_get_word(dev->config + msi_data_off(dev, msi64bit)); - if (nr_vectors > 1) { - data &= ~(nr_vectors - 1); - data |= vector; - } + msg = msi_get_message(dev, vector); MSI_DEV_PRINTF(dev, "notify vector 0x%x" " address: 0x%"PRIx64" data: 0x%"PRIx32"\n", - vector, address, data); - stl_le_phys(address, data); + vector, msg.address, msg.data); + stl_le_phys(msg.address, msg.data); } /* Normally called by pci_default_write_config(). */ diff --git a/hw/msi.h b/hw/msi.h index 6ec1f99f80..150b09a19d 100644 --- a/hw/msi.h +++ b/hw/msi.h @@ -32,6 +32,7 @@ struct MSIMessage { extern bool msi_supported; void msi_set_message(PCIDevice *dev, MSIMessage msg); +MSIMessage msi_get_message(PCIDevice *dev, unsigned int vector); bool msi_enabled(const PCIDevice *dev); int msi_init(struct PCIDevice *dev, uint8_t offset, unsigned int nr_vectors, bool msi64bit, bool msi_per_vector_mask); From 2b199f9318ec698dfced65f5d4ad9729f166e37c Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 2 Oct 2012 13:22:14 -0600 Subject: [PATCH 04/25] pci-assign: Use msi_get_message() pci-assign only uses a subset of the flexibility msi_get_message() provides, but it's still worthwhile to use it. Signed-off-by: Alex Williamson Signed-off-by: Michael S. Tsirkin --- hw/kvm/pci-assign.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c index ab01205666..e80dad009c 100644 --- a/hw/kvm/pci-assign.c +++ b/hw/kvm/pci-assign.c @@ -996,12 +996,9 @@ static void assigned_dev_update_msi(PCIDevice *pci_dev) } if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) { - uint8_t *pos = pci_dev->config + pci_dev->msi_cap; - MSIMessage msg; + MSIMessage msg = msi_get_message(pci_dev, 0); int virq; - msg.address = pci_get_long(pos + PCI_MSI_ADDRESS_LO); - msg.data = pci_get_word(pos + PCI_MSI_DATA_32); virq = kvm_irqchip_add_msi_route(kvm_state, msg); if (virq < 0) { perror("assigned_dev_update_msi: kvm_irqchip_add_msi_route"); From 05c0621e64b425d9f89bef542f0b85e61dc57ff8 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 17 Oct 2012 16:13:12 -0600 Subject: [PATCH 05/25] pci: Return PCI_INTX_DISABLED when no bus INTx routing support Rather than assert, simply return PCI_INTX_DISABLED when we don't have a pci_route_irq_fn. PIIX already returns DISABLED for an invalid pin, so users already deal with this state. Users of this interface should only be acting on an ENABLED or INVERTED return value (though we really have no support for INVERTED). Also complain loudly when we hit this so we don't forget it's missing. Signed-off-by: Alex Williamson Signed-off-by: Michael S. Tsirkin Acked-by: Jan Kiszka --- hw/pci.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hw/pci.c b/hw/pci.c index 8c6b3d19ae..70b5fd965b 100644 --- a/hw/pci.c +++ b/hw/pci.c @@ -1117,7 +1117,13 @@ PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) pin = bus->map_irq(dev, pin); dev = bus->parent_dev; } while (dev); - assert(bus->route_intx_to_irq); + + if (!bus->route_intx_to_irq) { + error_report("PCI: Bug - unimplemented PCI INTx routing (%s)\n", + object_get_typename(OBJECT(bus->qbus.parent))); + return (PCIINTxRoute) { PCI_INTX_DISABLED, -1 }; + } + return bus->route_intx_to_irq(bus->irq_opaque, pin); } From dc59944bc9a5ad784572eea57610de60e4a2f4e5 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 18 Oct 2012 00:15:48 +0200 Subject: [PATCH 06/25] qemu: enable PV EOI for qemu 1.3 Enable KVM PV EOI by default. You can still disable it with -kvm_pv_eoi cpu flag. To avoid breaking cross-version migration, enable only for qemu 1.3 (or in the future, newer) machine type. Signed-off-by: Michael S. Tsirkin --- hw/pc_piix.c | 9 ++++++++- target-i386/cpu.c | 33 ++++++++++++++++++++------------- target-i386/cpu.h | 2 ++ 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/hw/pc_piix.c b/hw/pc_piix.c index c7dd75b0c0..85529b2cea 100644 --- a/hw/pc_piix.c +++ b/hw/pc_piix.c @@ -43,6 +43,7 @@ #include "xen.h" #include "memory.h" #include "exec-memory.h" +#include "cpu.h" #ifdef CONFIG_XEN # include #endif @@ -302,6 +303,12 @@ static void pc_init_pci(QEMUMachineInitArgs *args) initrd_filename, cpu_model, 1, 1); } +static void pc_init_pci_1_3(QEMUMachineInitArgs *args) +{ + enable_kvm_pv_eoi(); + pc_init_pci(args); +} + static void pc_init_pci_no_kvmclock(QEMUMachineInitArgs *args) { ram_addr_t ram_size = args->ram_size; @@ -349,7 +356,7 @@ static QEMUMachine pc_machine_v1_3 = { .name = "pc-1.3", .alias = "pc", .desc = "Standard PC", - .init = pc_init_pci, + .init = pc_init_pci_1_3, .max_cpus = 255, .is_default = 1, }; diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 6411042b79..d4f2e65cd9 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -125,6 +125,25 @@ typedef struct model_features_t { int check_cpuid = 0; int enforce_cpuid = 0; +#if defined(CONFIG_KVM) +static uint32_t kvm_default_features = (1 << KVM_FEATURE_CLOCKSOURCE) | + (1 << KVM_FEATURE_NOP_IO_DELAY) | + (1 << KVM_FEATURE_MMU_OP) | + (1 << KVM_FEATURE_CLOCKSOURCE2) | + (1 << KVM_FEATURE_ASYNC_PF) | + (1 << KVM_FEATURE_STEAL_TIME) | + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); +static const uint32_t kvm_pv_eoi_features = (0x1 << KVM_FEATURE_PV_EOI); +#else +static uint32_t kvm_default_features = 0; +static const uint32_t kvm_pv_eoi_features = 0; +#endif + +void enable_kvm_pv_eoi(void) +{ + kvm_default_features |= kvm_pv_eoi_features; +} + void host_cpuid(uint32_t function, uint32_t count, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { @@ -1108,7 +1127,7 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model) /* Features to be added*/ uint32_t plus_features = 0, plus_ext_features = 0; uint32_t plus_ext2_features = 0, plus_ext3_features = 0; - uint32_t plus_kvm_features = 0, plus_svm_features = 0; + uint32_t plus_kvm_features = kvm_default_features, plus_svm_features = 0; uint32_t plus_7_0_ebx_features = 0; /* Features to be removed */ uint32_t minus_features = 0, minus_ext_features = 0; @@ -1128,18 +1147,6 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model) memcpy(x86_cpu_def, def, sizeof(*def)); } -#if defined(CONFIG_KVM) - plus_kvm_features = (1 << KVM_FEATURE_CLOCKSOURCE) | - (1 << KVM_FEATURE_NOP_IO_DELAY) | - (1 << KVM_FEATURE_MMU_OP) | - (1 << KVM_FEATURE_CLOCKSOURCE2) | - (1 << KVM_FEATURE_ASYNC_PF) | - (1 << KVM_FEATURE_STEAL_TIME) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); -#else - plus_kvm_features = 0; -#endif - add_flagname_to_bitmaps("hypervisor", &plus_features, &plus_ext_features, &plus_ext2_features, &plus_ext3_features, &plus_kvm_features, &plus_svm_features, &plus_7_0_ebx_features); diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 871c270116..de33303dea 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -1188,4 +1188,6 @@ void do_smm_enter(CPUX86State *env1); void cpu_report_tpr_access(CPUX86State *env, TPRAccess access); +void enable_kvm_pv_eoi(void); + #endif /* CPU_I386_H */ From e26631b74663f59b5873a84ed5b92ee4ddf48255 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 22 Oct 2012 12:35:00 +0200 Subject: [PATCH 07/25] pci: make each capability DWORD aligned PCI spec (see e.g. 6.7 Capabilities List in spec rev 3.0) requires that each capability is DWORD aligned. Ensure this when allocating space by rounding size up to 4. Signed-off-by: Michael S. Tsirkin --- hw/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/pci.c b/hw/pci.c index 70b5fd965b..35cb374845 100644 --- a/hw/pci.c +++ b/hw/pci.c @@ -1906,7 +1906,7 @@ int pci_add_capability(PCIDevice *pdev, uint8_t cap_id, config[PCI_CAP_LIST_NEXT] = pdev->config[PCI_CAPABILITY_LIST]; pdev->config[PCI_CAPABILITY_LIST] = offset; pdev->config[PCI_STATUS] |= PCI_STATUS_CAP_LIST; - memset(pdev->used + offset, 0xFF, size); + memset(pdev->used + offset, 0xFF, QEMU_ALIGN_UP(size, 4)); /* Make capability read-only by default */ memset(pdev->wmask + offset, 0, size); /* Check capability by default */ @@ -1926,7 +1926,7 @@ void pci_del_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size) memset(pdev->w1cmask + offset, 0, size); /* Clear cmask as device-specific registers can't be checked */ memset(pdev->cmask + offset, 0, size); - memset(pdev->used + offset, 0, size); + memset(pdev->used + offset, 0, QEMU_ALIGN_UP(size, 4)); if (!pdev->config[PCI_CAPABILITY_LIST]) pdev->config[PCI_STATUS] &= ~PCI_STATUS_CAP_LIST; From b56d701f1d1f1828c9fabea535b3460857546dd0 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Fri, 19 Oct 2012 16:43:28 -0400 Subject: [PATCH 08/25] pci: pci capability must be in PCI space pci capability must be in PCI space. It can't lay in PCIe extended config space. Reviewed-by: Paolo Bonzini Signed-off-by: Isaku Yamahata Signed-off-by: Jason Baron Signed-off-by: Michael S. Tsirkin --- hw/pci.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/pci.c b/hw/pci.c index 35cb374845..82c971a17d 100644 --- a/hw/pci.c +++ b/hw/pci.c @@ -1678,16 +1678,16 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const char *name) return pci_create_simple_multifunction(bus, devfn, false, name); } -static int pci_find_space(PCIDevice *pdev, uint8_t size) +static uint8_t pci_find_space(PCIDevice *pdev, uint8_t size) { - int config_size = pci_config_size(pdev); int offset = PCI_CONFIG_HEADER_SIZE; int i; - for (i = PCI_CONFIG_HEADER_SIZE; i < config_size; ++i) + for (i = PCI_CONFIG_HEADER_SIZE; i < PCI_CONFIG_SPACE_SIZE; ++i) { if (pdev->used[i]) offset = i + 1; else if (i - offset + 1 == size) return offset; + } return 0; } From 9e38f56183c52e06fc29c64691f59a46d246eec5 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Fri, 19 Oct 2012 16:43:31 -0400 Subject: [PATCH 09/25] pci_ids: add intel 82801BA pci-to-pci bridge id Adds pci id constants which will be used by q35. Reviewed-by: Paolo Bonzini Signed-off-by: Isaku Yamahata Signed-off-by: Jason Baron Signed-off-by: Michael S. Tsirkin --- hw/pci_ids.h | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/pci_ids.h b/hw/pci_ids.h index c017a79998..0bfc28f7fb 100644 --- a/hw/pci_ids.h +++ b/hw/pci_ids.h @@ -105,6 +105,7 @@ #define PCI_DEVICE_ID_INTEL_82378 0x0484 #define PCI_DEVICE_ID_INTEL_82441 0x1237 #define PCI_DEVICE_ID_INTEL_82801AA_5 0x2415 +#define PCI_DEVICE_ID_INTEL_82801BA_11 0x244e #define PCI_DEVICE_ID_INTEL_82801D 0x24CD #define PCI_DEVICE_ID_INTEL_ESB_9 0x25ab #define PCI_DEVICE_ID_INTEL_82371SB_0 0x7000 From 91e5615984c1fd5674caad343e750bb5ecd17995 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Fri, 19 Oct 2012 16:43:28 -0400 Subject: [PATCH 10/25] pci: introduce pci_swizzle_map_irq_fn() for standardized interrupt pin swizzle Introduce pci_swizzle_map_irq_fn() for interrupt pin swizzle which is standardized. PCI bridge swizzle is common logic, by introducing this function duplicated swizzle logic will be avoided later. [jbaron@redhat.com: drop opaque argument] Reviewed-by: Paolo Bonzini Signed-off-by: Isaku Yamahata Signed-off-by: Jason Baron Signed-off-by: Michael S. Tsirkin --- hw/pci.c | 18 ++++++++++++++++++ hw/pci.h | 2 ++ 2 files changed, 20 insertions(+) diff --git a/hw/pci.c b/hw/pci.c index 82c971a17d..63b1857f88 100644 --- a/hw/pci.c +++ b/hw/pci.c @@ -1155,6 +1155,24 @@ void pci_device_set_intx_routing_notifier(PCIDevice *dev, dev->intx_routing_notifier = notifier; } +/* + * PCI-to-PCI bridge specification + * 9.1: Interrupt routing. Table 9-1 + * + * the PCI Express Base Specification, Revision 2.1 + * 2.2.8.1: INTx interrutp signaling - Rules + * the Implementation Note + * Table 2-20 + */ +/* + * 0 <= pin <= 3 0 = INTA, 1 = INTB, 2 = INTC, 3 = INTD + * 0-origin unlike PCI interrupt pin register. + */ +int pci_swizzle_map_irq_fn(PCIDevice *pci_dev, int pin) +{ + return (pin + PCI_SLOT(pci_dev->devfn)) % PCI_NUM_PINS; +} + /***********************************************************/ /* monitor info on PCI */ diff --git a/hw/pci.h b/hw/pci.h index a852941a5c..241c1d8905 100644 --- a/hw/pci.h +++ b/hw/pci.h @@ -318,6 +318,8 @@ void pci_bus_irqs(PCIBus *bus, pci_set_irq_fn set_irq, pci_map_irq_fn map_irq, void *irq_opaque, int nirq); int pci_bus_get_irq_level(PCIBus *bus, int irq_num); void pci_bus_hotplug(PCIBus *bus, pci_hotplug_fn hotplug, DeviceState *dev); +/* 0 <= pin <= 3 0 = INTA, 1 = INTB, 2 = INTC, 3 = INTD */ +int pci_swizzle_map_irq_fn(PCIDevice *pci_dev, int pin); PCIBus *pci_register_bus(DeviceState *parent, const char *name, pci_set_irq_fn set_irq, pci_map_irq_fn map_irq, void *irq_opaque, From f7748569902f4854ac1223c143edbde4f588040f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 19 Oct 2012 16:43:31 -0400 Subject: [PATCH 11/25] pci: Add class 0xc05 as 'SMBus' [jbaron@redhat.com: add PCI_CLASS_SERIAL_SMBUS definition] Reviewed-by: Paolo Bonzini Signed-off-by: Jan Kiszka Signed-off-by: Jason Baron Signed-off-by: Michael S. Tsirkin --- hw/pci.c | 1 + hw/pci_ids.h | 1 + 2 files changed, 2 insertions(+) diff --git a/hw/pci.c b/hw/pci.c index 63b1857f88..dceda0bdc5 100644 --- a/hw/pci.c +++ b/hw/pci.c @@ -1237,6 +1237,7 @@ static const pci_class_desc pci_class_descriptions[] = { 0x0c02, "SSA controller", "ssa"}, { 0x0c03, "USB controller", "usb"}, { 0x0c04, "Fibre channel controller", "fibre-channel"}, + { 0x0c05, "SMBus"}, { 0, NULL} }; diff --git a/hw/pci_ids.h b/hw/pci_ids.h index 0bfc28f7fb..41f3570fb9 100644 --- a/hw/pci_ids.h +++ b/hw/pci_ids.h @@ -31,6 +31,7 @@ #define PCI_CLASS_SYSTEM_OTHER 0x0880 #define PCI_CLASS_SERIAL_USB 0x0c03 +#define PCI_CLASS_SERIAL_SMBUS 0x0c05 #define PCI_CLASS_BRIDGE_HOST 0x0600 #define PCI_CLASS_BRIDGE_ISA 0x0601 From c702ddb8daece08b16fce9d6654b38304d385f93 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 19 Oct 2012 16:43:32 -0400 Subject: [PATCH 12/25] pcie: pass pcie window size to pcie_host_mmcfg_update() This allows q35 to pass/set the size of the pcie window in its update routine. Reviewed-by: Paolo Bonzini Signed-off-by: Jason Baron Signed-off-by: Michael S. Tsirkin --- hw/pcie_host.c | 21 ++++++++++++--------- hw/pcie_host.h | 7 ++++--- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/hw/pcie_host.c b/hw/pcie_host.c index 9f7f3d3b1e..2231d28ec3 100644 --- a/hw/pcie_host.c +++ b/hw/pcie_host.c @@ -107,14 +107,9 @@ static const MemoryRegionOps pcie_mmcfg_ops = { /* pcie_host::base_addr == PCIE_BASE_ADDR_UNMAPPED when it isn't mapped. */ #define PCIE_BASE_ADDR_UNMAPPED ((hwaddr)-1ULL) -int pcie_host_init(PCIExpressHost *e, uint32_t size) +int pcie_host_init(PCIExpressHost *e) { - assert(!(size & (size - 1))); /* power of 2 */ - assert(size >= PCIE_MMCFG_SIZE_MIN); - assert(size <= PCIE_MMCFG_SIZE_MAX); e->base_addr = PCIE_BASE_ADDR_UNMAPPED; - e->size = size; - memory_region_init_io(&e->mmio, &pcie_mmcfg_ops, e, "pcie-mmcfg", e->size); return 0; } @@ -123,22 +118,30 @@ void pcie_host_mmcfg_unmap(PCIExpressHost *e) { if (e->base_addr != PCIE_BASE_ADDR_UNMAPPED) { memory_region_del_subregion(get_system_memory(), &e->mmio); + memory_region_destroy(&e->mmio); e->base_addr = PCIE_BASE_ADDR_UNMAPPED; } } -void pcie_host_mmcfg_map(PCIExpressHost *e, hwaddr addr) +void pcie_host_mmcfg_map(PCIExpressHost *e, hwaddr addr, + uint32_t size) { + assert(!(size & (size - 1))); /* power of 2 */ + assert(size >= PCIE_MMCFG_SIZE_MIN); + assert(size <= PCIE_MMCFG_SIZE_MAX); + e->size = size; + memory_region_init_io(&e->mmio, &pcie_mmcfg_ops, e, "pcie-mmcfg", e->size); e->base_addr = addr; memory_region_add_subregion(get_system_memory(), e->base_addr, &e->mmio); } void pcie_host_mmcfg_update(PCIExpressHost *e, int enable, - hwaddr addr) + hwaddr addr, + uint32_t size) { pcie_host_mmcfg_unmap(e); if (enable) { - pcie_host_mmcfg_map(e, addr); + pcie_host_mmcfg_map(e, addr, size); } } diff --git a/hw/pcie_host.h b/hw/pcie_host.h index 9978b9f2f1..73e88db73e 100644 --- a/hw/pcie_host.h +++ b/hw/pcie_host.h @@ -39,11 +39,12 @@ struct PCIExpressHost { MemoryRegion mmio; }; -int pcie_host_init(PCIExpressHost *e, uint32_t size); +int pcie_host_init(PCIExpressHost *e); void pcie_host_mmcfg_unmap(PCIExpressHost *e); -void pcie_host_mmcfg_map(PCIExpressHost *e, hwaddr addr); +void pcie_host_mmcfg_map(PCIExpressHost *e, hwaddr addr, uint32_t size); void pcie_host_mmcfg_update(PCIExpressHost *e, int enable, - hwaddr addr); + hwaddr addr, + uint32_t size); #endif /* PCIE_HOST_H */ From bc927e488c040c8ef2faed323e8a1a3d0ebbfb62 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 19 Oct 2012 16:43:33 -0400 Subject: [PATCH 13/25] pcie: Convert PCIExpressHost to use the QOM. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's use PCIExpressHost with QOM. Reviewed-by: Paolo Bonzini Acked-by: Andreas Färber Signed-off-by: Jason Baron Signed-off-by: Michael S. Tsirkin --- hw/pcie_host.c | 14 ++++++++++++++ hw/pcie_host.h | 4 ++++ 2 files changed, 18 insertions(+) diff --git a/hw/pcie_host.c b/hw/pcie_host.c index 2231d28ec3..c257fb43ca 100644 --- a/hw/pcie_host.c +++ b/hw/pcie_host.c @@ -145,3 +145,17 @@ void pcie_host_mmcfg_update(PCIExpressHost *e, pcie_host_mmcfg_map(e, addr, size); } } + +static const TypeInfo pcie_host_type_info = { + .name = TYPE_PCIE_HOST_BRIDGE, + .parent = TYPE_PCI_HOST_BRIDGE, + .abstract = true, + .instance_size = sizeof(PCIExpressHost), +}; + +static void pcie_host_register_types(void) +{ + type_register_static(&pcie_host_type_info); +} + +type_init(pcie_host_register_types) diff --git a/hw/pcie_host.h b/hw/pcie_host.h index 73e88db73e..392193530d 100644 --- a/hw/pcie_host.h +++ b/hw/pcie_host.h @@ -24,6 +24,10 @@ #include "pci_host.h" #include "memory.h" +#define TYPE_PCIE_HOST_BRIDGE "pcie-host-bridge" +#define PCIE_HOST_BRIDGE(obj) \ + OBJECT_CHECK(PCIExpressHost, (obj), TYPE_PCIE_HOST_BRIDGE) + struct PCIExpressHost { PCIHostState pci; From e35e23f655fb120a2b4d0695bdee86fafd9caabf Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 24 Sep 2012 12:12:25 +0200 Subject: [PATCH 14/25] virtio-net: track host/guest header length Tracking these in device state instead of re-calculating on each packet. No functional changes. Signed-off-by: Michael S. Tsirkin --- hw/virtio-net.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 50ba728c02..a0c79e1d1b 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -41,6 +41,8 @@ typedef struct VirtIONet int32_t tx_burst; int tx_waiting; uint32_t has_vnet_hdr; + size_t host_hdr_len; + size_t guest_hdr_len; uint8_t has_ufo; struct { VirtQueueElement elem; @@ -231,6 +233,7 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features) if (peer_has_vnet_hdr(n)) { tap_using_vnet_hdr(n->nic->nc.peer, 1); + n->host_hdr_len = sizeof(struct virtio_net_hdr); } else { features &= ~(0x1 << VIRTIO_NET_F_CSUM); features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO4); @@ -278,6 +281,8 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features) VirtIONet *n = to_virtio_net(vdev); n->mergeable_rx_bufs = !!(features & (1 << VIRTIO_NET_F_MRG_RXBUF)); + n->guest_hdr_len = n->mergeable_rx_bufs ? + sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); if (n->has_vnet_hdr) { tap_set_offload(n->nic->nc.peer, @@ -593,18 +598,13 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t { VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque; struct virtio_net_hdr_mrg_rxbuf *mhdr = NULL; - size_t guest_hdr_len, offset, i, host_hdr_len; + size_t offset, i; if (!virtio_net_can_receive(&n->nic->nc)) return -1; /* hdr_len refers to the header we supply to the guest */ - guest_hdr_len = n->mergeable_rx_bufs ? - sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); - - - host_hdr_len = n->has_vnet_hdr ? sizeof(struct virtio_net_hdr) : 0; - if (!virtio_net_has_buffers(n, size + guest_hdr_len - host_hdr_len)) + if (!virtio_net_has_buffers(n, size + n->guest_hdr_len - n->host_hdr_len)) return 0; if (!receive_filter(n, buf, size)) @@ -626,7 +626,7 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t "i %zd mergeable %d offset %zd, size %zd, " "guest hdr len %zd, host hdr len %zd guest features 0x%x", i, n->mergeable_rx_bufs, offset, size, - guest_hdr_len, host_hdr_len, n->vdev.guest_features); + n->guest_hdr_len, n->host_hdr_len, n->vdev.guest_features); exit(1); } @@ -635,7 +635,7 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t exit(1); } - if (!n->mergeable_rx_bufs && elem.in_sg[0].iov_len != guest_hdr_len) { + if (!n->mergeable_rx_bufs && elem.in_sg[0].iov_len != n->guest_hdr_len) { error_report("virtio-net header not in first element"); exit(1); } @@ -647,8 +647,9 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t mhdr = (struct virtio_net_hdr_mrg_rxbuf *)sg[0].iov_base; offset += receive_header(n, sg, elem.in_num, - buf + offset, size - offset, guest_hdr_len); - total += guest_hdr_len; + buf + offset, size - offset, + n->guest_hdr_len); + total += n->guest_hdr_len; } /* copy in packet. ugh */ @@ -665,7 +666,7 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t "i %zd mergeable %d offset %zd, size %zd, " "guest hdr len %zd, host hdr len %zd", i, n->mergeable_rx_bufs, - offset, size, guest_hdr_len, host_hdr_len); + offset, size, n->guest_hdr_len, n->host_hdr_len); #endif return size; } @@ -900,6 +901,8 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) qemu_get_buffer(f, n->mac, ETH_ALEN); n->tx_waiting = qemu_get_be32(f); n->mergeable_rx_bufs = qemu_get_be32(f); + n->guest_hdr_len = n->mergeable_rx_bufs ? + sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); if (version_id >= 3) n->status = qemu_get_be16(f); @@ -940,6 +943,7 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) if (n->has_vnet_hdr) { tap_using_vnet_hdr(n->nic->nc.peer, 1); + n->host_hdr_len = sizeof(struct virtio_net_hdr); tap_set_offload(n->nic->nc.peer, (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_CSUM) & 1, (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO4) & 1, @@ -1044,6 +1048,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf, n->tx_waiting = 0; n->tx_burst = net->txburst; n->mergeable_rx_bufs = 0; + n->guest_hdr_len = sizeof(struct virtio_net_hdr); n->promisc = 1; /* for compatibility */ n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN); From d336336c8164859e4527cbb9f3df189f8bb406de Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 24 Sep 2012 13:02:52 +0200 Subject: [PATCH 15/25] iov: add iov_cpy Add API to copy part of iovec safely. Signed-off-by: Michael S. Tsirkin --- iov.c | 23 +++++++++++++++++++++++ iov.h | 9 +++++++++ 2 files changed, 32 insertions(+) diff --git a/iov.c b/iov.c index c6a66f0afe..b7378bf7ce 100644 --- a/iov.c +++ b/iov.c @@ -228,3 +228,26 @@ void iov_hexdump(const struct iovec *iov, const unsigned int iov_cnt, fprintf(fp, "\n"); } } + +unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt, + const struct iovec *iov, unsigned int iov_cnt, + size_t offset, size_t bytes) +{ + size_t len; + unsigned int i, j; + for (i = 0, j = 0; i < iov_cnt && j < dst_iov_cnt && bytes; i++) { + if (offset >= iov[i].iov_len) { + offset -= iov[i].iov_len; + continue; + } + len = MIN(bytes, iov[i].iov_len - offset); + + dst_iov[j].iov_base = iov[i].iov_base + offset; + dst_iov[j].iov_len = len; + j++; + bytes -= len; + offset = 0; + } + assert(offset == 0); + return j; +} diff --git a/iov.h b/iov.h index a73569f94e..34c8ec9faa 100644 --- a/iov.h +++ b/iov.h @@ -86,3 +86,12 @@ ssize_t iov_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt, */ void iov_hexdump(const struct iovec *iov, const unsigned int iov_cnt, FILE *fp, const char *prefix, size_t limit); + +/* + * Partial copy of vector from iov to dst_iov (data is not copied). + * dst_iov overlaps iov at a specified offset. + * size of dst_iov is at most bytes. dst vector count is returned. + */ +unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt, + const struct iovec *iov, unsigned int iov_cnt, + size_t offset, size_t bytes); From 22cc84db6e42bef8646b8cd671f4c999e9c0a38f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 24 Sep 2012 13:14:16 +0200 Subject: [PATCH 16/25] virtio-net: avoid sg copy Avoid tweaking iovec during receive. This removes the need to copy the vector. Note: we currently have an evil cast in work_around_broken_dhclient and unfortunately this patch does not fix it - just pushes the evil cast to another place. Signed-off-by: Michael S. Tsirkin --- hw/virtio-net.c | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index a0c79e1d1b..84615869cb 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -504,40 +504,37 @@ static int virtio_net_has_buffers(VirtIONet *n, int bufsize) * cache. */ static void work_around_broken_dhclient(struct virtio_net_hdr *hdr, - const uint8_t *buf, size_t size) + uint8_t *buf, size_t size) { if ((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && /* missing csum */ (size > 27 && size < 1500) && /* normal sized MTU */ (buf[12] == 0x08 && buf[13] == 0x00) && /* ethertype == IPv4 */ (buf[23] == 17) && /* ip.protocol == UDP */ (buf[34] == 0 && buf[35] == 67)) { /* udp.srcport == bootps */ - /* FIXME this cast is evil */ - net_checksum_calculate((uint8_t *)buf, size); + net_checksum_calculate(buf, size); hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM; } } -static int receive_header(VirtIONet *n, struct iovec *iov, int iovcnt, - const void *buf, size_t size, size_t hdr_len) +static int receive_header(VirtIONet *n, const struct iovec *iov, int iov_cnt, + const void *buf, size_t size) { - struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)iov[0].iov_base; int offset = 0; - hdr->flags = 0; - hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; - if (n->has_vnet_hdr) { - memcpy(hdr, buf, sizeof(*hdr)); - offset = sizeof(*hdr); - work_around_broken_dhclient(hdr, buf + offset, size - offset); + /* FIXME this cast is evil */ + void *wbuf = (void *)buf; + work_around_broken_dhclient(wbuf, wbuf + offset, size - offset); + offset = sizeof(struct virtio_net_hdr); + iov_from_buf(iov, iov_cnt, 0, buf, offset); + } else { + struct virtio_net_hdr hdr = { + .flags = 0, + .gso_type = VIRTIO_NET_HDR_GSO_NONE + }; + iov_from_buf(iov, iov_cnt, 0, &hdr, sizeof hdr); } - /* We only ever receive a struct virtio_net_hdr from the tapfd, - * but we may be passing along a larger header to the guest. - */ - iov[0].iov_base += hdr_len; - iov[0].iov_len -= hdr_len; - return offset; } @@ -598,7 +595,8 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t { VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque; struct virtio_net_hdr_mrg_rxbuf *mhdr = NULL; - size_t offset, i; + const struct iovec *sg = elem.in_sg; + size_t offset, i, guest_offset; if (!virtio_net_can_receive(&n->nic->nc)) return -1; @@ -615,7 +613,7 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t while (offset < size) { VirtQueueElement elem; int len, total; - struct iovec sg[VIRTQUEUE_MAX_SIZE]; + const struct iovec *sg = elem.in_sg; total = 0; @@ -640,20 +638,20 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t exit(1); } - memcpy(&sg, &elem.in_sg[0], sizeof(sg[0]) * elem.in_num); - if (i == 0) { if (n->mergeable_rx_bufs) mhdr = (struct virtio_net_hdr_mrg_rxbuf *)sg[0].iov_base; offset += receive_header(n, sg, elem.in_num, - buf + offset, size - offset, - n->guest_hdr_len); + buf + offset, size - offset); total += n->guest_hdr_len; + guest_offset = n->guest_hdr_len; + } else { + guest_offset = 0; } /* copy in packet. ugh */ - len = iov_from_buf(sg, elem.in_num, 0, + len = iov_from_buf(sg, elem.in_num, guest_offset, buf + offset, size - offset); total += len; offset += len; From 63c5872873de8d7d994a589eed7bfe6a70cc8e06 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 24 Sep 2012 13:17:13 +0200 Subject: [PATCH 17/25] virtio-net: use safe iov operations for rx Avoid magling iov manually: use safe iov operations for processing packets incoming to guest. This also removes the requirement for virtio header to fit the first s/g entry exactly. Signed-off-by: Michael S. Tsirkin --- hw/virtio-net.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 84615869cb..d315fdf0a4 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -594,8 +594,9 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t size) { VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque; - struct virtio_net_hdr_mrg_rxbuf *mhdr = NULL; - const struct iovec *sg = elem.in_sg; + struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE]; + struct virtio_net_hdr_mrg_rxbuf mhdr; + unsigned mhdr_cnt = 0; size_t offset, i, guest_offset; if (!virtio_net_can_receive(&n->nic->nc)) @@ -633,14 +634,13 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t exit(1); } - if (!n->mergeable_rx_bufs && elem.in_sg[0].iov_len != n->guest_hdr_len) { - error_report("virtio-net header not in first element"); - exit(1); - } - if (i == 0) { - if (n->mergeable_rx_bufs) - mhdr = (struct virtio_net_hdr_mrg_rxbuf *)sg[0].iov_base; + if (n->mergeable_rx_bufs) { + mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg), + sg, elem.in_num, + offsetof(typeof(mhdr), num_buffers), + sizeof(mhdr.num_buffers)); + } offset += receive_header(n, sg, elem.in_num, buf + offset, size - offset); @@ -673,8 +673,11 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t virtqueue_fill(n->rx_vq, &elem, total, i++); } - if (mhdr) { - stw_p(&mhdr->num_buffers, i); + if (mhdr_cnt) { + stw_p(&mhdr.num_buffers, i); + iov_from_buf(mhdr_sg, mhdr_cnt, + 0, + &mhdr.num_buffers, sizeof mhdr.num_buffers); } virtqueue_flush(n->rx_vq, i); From 280598b7a5fdf96fb79d87a2129750bad5dbf24b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 24 Sep 2012 13:24:17 +0200 Subject: [PATCH 18/25] virtio-net: refactor receive_hdr Now that we know host hdr length, we don't need to duplicate the logic in receive_hdr: caller can figure out the offset itself. Signed-off-by: Michael S. Tsirkin --- hw/virtio-net.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index d315fdf0a4..5128a2a8ba 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -516,17 +516,15 @@ static void work_around_broken_dhclient(struct virtio_net_hdr *hdr, } } -static int receive_header(VirtIONet *n, const struct iovec *iov, int iov_cnt, - const void *buf, size_t size) +static void receive_header(VirtIONet *n, const struct iovec *iov, int iov_cnt, + const void *buf, size_t size) { - int offset = 0; - if (n->has_vnet_hdr) { /* FIXME this cast is evil */ void *wbuf = (void *)buf; - work_around_broken_dhclient(wbuf, wbuf + offset, size - offset); - offset = sizeof(struct virtio_net_hdr); - iov_from_buf(iov, iov_cnt, 0, buf, offset); + work_around_broken_dhclient(wbuf, wbuf + n->host_hdr_len, + size - n->host_hdr_len); + iov_from_buf(iov, iov_cnt, 0, buf, sizeof(struct virtio_net_hdr)); } else { struct virtio_net_hdr hdr = { .flags = 0, @@ -534,8 +532,6 @@ static int receive_header(VirtIONet *n, const struct iovec *iov, int iov_cnt, }; iov_from_buf(iov, iov_cnt, 0, &hdr, sizeof hdr); } - - return offset; } static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) @@ -642,8 +638,8 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t sizeof(mhdr.num_buffers)); } - offset += receive_header(n, sg, elem.in_num, - buf + offset, size - offset); + receive_header(n, sg, elem.in_num, buf + offset, size - offset); + offset += n->host_hdr_len; total += n->guest_hdr_len; guest_offset = n->guest_hdr_len; } else { From c8d28e7e336869524d166d88f08ad476eadedccb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 24 Sep 2012 13:26:55 +0200 Subject: [PATCH 19/25] virtio-net: first s/g is always at start of buf We know offset is 0, assert that. Signed-off-by: Michael S. Tsirkin --- hw/virtio-net.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 5128a2a8ba..59ab6746f0 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -631,6 +631,7 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t } if (i == 0) { + assert(offset == 0); if (n->mergeable_rx_bufs) { mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg), sg, elem.in_num, @@ -638,8 +639,8 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t sizeof(mhdr.num_buffers)); } - receive_header(n, sg, elem.in_num, buf + offset, size - offset); - offset += n->host_hdr_len; + receive_header(n, sg, elem.in_num, buf, size); + offset = n->host_hdr_len; total += n->guest_hdr_len; guest_offset = n->guest_hdr_len; } else { From 14761f9cf7fbc6d058c1e51c313a139066eab256 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 24 Sep 2012 14:52:28 +0200 Subject: [PATCH 20/25] virtio-net: switch tx to safe iov functions Avoid mangling iovec manually: use safe iov_* functions. Signed-off-by: Michael S. Tsirkin --- hw/virtio-net.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 59ab6746f0..5206648271 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -715,10 +715,10 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) } while (virtqueue_pop(vq, &elem)) { - ssize_t ret, len = 0; + ssize_t ret, len; unsigned int out_num = elem.out_num; struct iovec *out_sg = &elem.out_sg[0]; - unsigned hdr_len; + struct iovec sg[VIRTQUEUE_MAX_SIZE]; /* hdr_len refers to the header received from the guest */ hdr_len = n->mergeable_rx_bufs ? @@ -730,18 +730,25 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) exit(1); } - /* ignore the header if GSO is not supported */ - if (!n->has_vnet_hdr) { - out_num--; - out_sg++; - len += hdr_len; - } else if (n->mergeable_rx_bufs) { - /* tapfd expects a struct virtio_net_hdr */ - hdr_len -= sizeof(struct virtio_net_hdr); - out_sg->iov_len -= hdr_len; - len += hdr_len; + /* + * If host wants to see the guest header as is, we can + * pass it on unchanged. Otherwise, copy just the parts + * that host is interested in. + */ + assert(n->host_hdr_len <= n->guest_hdr_len); + if (n->host_hdr_len != n->guest_hdr_len) { + unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg), + out_sg, out_num, + 0, n->host_hdr_len); + sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num, + out_sg, out_num, + n->guest_hdr_len, -1); + out_num = sg_num; + out_sg = sg; } + len = hdr_len; + ret = qemu_sendv_packet_async(&n->nic->nc, out_sg, out_num, virtio_net_tx_complete); if (ret == 0) { From 7b80d08efc36fd6c7881e98302f00148b5fd908a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 24 Sep 2012 14:54:44 +0200 Subject: [PATCH 21/25] virtio-net: simplify rx code Remove code duplication using guest header length that we track. Drop specific layout requirement for rx buffers: things work using generic iovec functions in any case. Signed-off-by: Michael S. Tsirkin --- hw/virtio-net.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 5206648271..dc4a26cbd6 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -720,12 +720,7 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) struct iovec *out_sg = &elem.out_sg[0]; struct iovec sg[VIRTQUEUE_MAX_SIZE]; - /* hdr_len refers to the header received from the guest */ - hdr_len = n->mergeable_rx_bufs ? - sizeof(struct virtio_net_hdr_mrg_rxbuf) : - sizeof(struct virtio_net_hdr); - - if (out_num < 1 || out_sg->iov_len != hdr_len) { + if (out_num < 1) { error_report("virtio-net header not in first element"); exit(1); } @@ -747,7 +742,7 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) out_sg = sg; } - len = hdr_len; + len = n->guest_hdr_len; ret = qemu_sendv_packet_async(&n->nic->nc, out_sg, out_num, virtio_net_tx_complete); From e043ebc6f9a093c1fd1b677191ad9fbeefe22d1e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 24 Sep 2012 16:27:27 +0200 Subject: [PATCH 22/25] virtio-net: minor code simplification During packet filtering, we can now use host hdr len to offset incoming buffer unconditionally. Signed-off-by: Michael S. Tsirkin --- hw/virtio-net.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index dc4a26cbd6..c94521e0fe 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -544,9 +544,7 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) if (n->promisc) return 1; - if (n->has_vnet_hdr) { - ptr += sizeof(struct virtio_net_hdr); - } + ptr += n->host_hdr_len; if (!memcmp(&ptr[12], vlan, sizeof(vlan))) { int vid = be16_to_cpup((uint16_t *)(ptr + 14)) & 0xfff; From 6e371ab867d48c16e6a9ee32cefcea48692a4deb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 24 Sep 2012 17:04:21 +0200 Subject: [PATCH 23/25] virtio-net: test peer header support at init time There's no reason to query header support at random times: at load or feature query. Driver also might not query functions. Cleaner to do it at device init. Signed-off-by: Michael S. Tsirkin --- hw/virtio-net.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index c94521e0fe..1180b51ac6 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -202,16 +202,19 @@ static void virtio_net_reset(VirtIODevice *vdev) memset(n->vlans, 0, MAX_VLAN >> 3); } -static int peer_has_vnet_hdr(VirtIONet *n) +static void peer_test_vnet_hdr(VirtIONet *n) { if (!n->nic->nc.peer) - return 0; + return; if (n->nic->nc.peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) - return 0; + return; n->has_vnet_hdr = tap_has_vnet_hdr(n->nic->nc.peer); +} +static int peer_has_vnet_hdr(VirtIONet *n) +{ return n->has_vnet_hdr; } @@ -231,10 +234,7 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features) features |= (1 << VIRTIO_NET_F_MAC); - if (peer_has_vnet_hdr(n)) { - tap_using_vnet_hdr(n->nic->nc.peer, 1); - n->host_hdr_len = sizeof(struct virtio_net_hdr); - } else { + if (!peer_has_vnet_hdr(n)) { features &= ~(0x1 << VIRTIO_NET_F_CSUM); features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO4); features &= ~(0x1 << VIRTIO_NET_F_HOST_TSO6); @@ -940,8 +940,6 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) } if (n->has_vnet_hdr) { - tap_using_vnet_hdr(n->nic->nc.peer, 1); - n->host_hdr_len = sizeof(struct virtio_net_hdr); tap_set_offload(n->nic->nc.peer, (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_CSUM) & 1, (n->vdev.guest_features >> VIRTIO_NET_F_GUEST_TSO4) & 1, @@ -1040,6 +1038,13 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf, n->status = VIRTIO_NET_S_LINK_UP; n->nic = qemu_new_nic(&net_virtio_info, conf, object_get_typename(OBJECT(dev)), dev->id, n); + peer_test_vnet_hdr(n); + if (peer_has_vnet_hdr(n)) { + tap_using_vnet_hdr(n->nic->nc.peer, 1); + n->host_hdr_len = sizeof(struct virtio_net_hdr); + } else { + n->host_hdr_len = 0; + } qemu_format_nic_info_str(&n->nic->nc, conf->macaddr.a); From ff3a8066e651230d255e6eea340e2d48e7da4aeb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 24 Sep 2012 21:05:03 +0200 Subject: [PATCH 24/25] virtio-net: enable mrg buf header in tap on linux Modern linux supports arbitrary header size, which makes it possible to pass mrg buf header to tap directly without iovec mangling. Use this capability when it is there. This removes the need to deal with it in vhost-net as we do now. Signed-off-by: Michael S. Tsirkin --- hw/vhost_net.c | 13 ------------- hw/virtio-net.c | 26 ++++++++++++++++++-------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/hw/vhost_net.c b/hw/vhost_net.c index df2c4a30a2..8241601539 100644 --- a/hw/vhost_net.c +++ b/hw/vhost_net.c @@ -150,10 +150,6 @@ int vhost_net_start(struct vhost_net *net, if (r < 0) { goto fail_notifiers; } - if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { - tap_set_vnet_hdr_len(net->nc, - sizeof(struct virtio_net_hdr_mrg_rxbuf)); - } r = vhost_dev_start(&net->dev, dev); if (r < 0) { @@ -179,9 +175,6 @@ fail: } net->nc->info->poll(net->nc, true); vhost_dev_stop(&net->dev, dev); - if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { - tap_set_vnet_hdr_len(net->nc, sizeof(struct virtio_net_hdr)); - } fail_start: vhost_dev_disable_notifiers(&net->dev, dev); fail_notifiers: @@ -199,18 +192,12 @@ void vhost_net_stop(struct vhost_net *net, } net->nc->info->poll(net->nc, true); vhost_dev_stop(&net->dev, dev); - if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { - tap_set_vnet_hdr_len(net->nc, sizeof(struct virtio_net_hdr)); - } vhost_dev_disable_notifiers(&net->dev, dev); } void vhost_net_cleanup(struct vhost_net *net) { vhost_dev_cleanup(&net->dev); - if (net->dev.acked_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { - tap_set_vnet_hdr_len(net->nc, sizeof(struct virtio_net_hdr)); - } g_free(net); } #else diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 1180b51ac6..108ce07cfc 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -228,6 +228,20 @@ static int peer_has_ufo(VirtIONet *n) return n->has_ufo; } +static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs) +{ + n->mergeable_rx_bufs = mergeable_rx_bufs; + + n->guest_hdr_len = n->mergeable_rx_bufs ? + sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); + + if (peer_has_vnet_hdr(n) && + tap_has_vnet_hdr_len(n->nic->nc.peer, n->guest_hdr_len)) { + tap_set_vnet_hdr_len(n->nic->nc.peer, n->guest_hdr_len); + n->host_hdr_len = n->guest_hdr_len; + } +} + static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features) { VirtIONet *n = to_virtio_net(vdev); @@ -280,9 +294,7 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features) { VirtIONet *n = to_virtio_net(vdev); - n->mergeable_rx_bufs = !!(features & (1 << VIRTIO_NET_F_MRG_RXBUF)); - n->guest_hdr_len = n->mergeable_rx_bufs ? - sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); + virtio_net_set_mrg_rx_bufs(n, !!(features & (1 << VIRTIO_NET_F_MRG_RXBUF))); if (n->has_vnet_hdr) { tap_set_offload(n->nic->nc.peer, @@ -898,9 +910,8 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) qemu_get_buffer(f, n->mac, ETH_ALEN); n->tx_waiting = qemu_get_be32(f); - n->mergeable_rx_bufs = qemu_get_be32(f); - n->guest_hdr_len = n->mergeable_rx_bufs ? - sizeof(struct virtio_net_hdr_mrg_rxbuf) : sizeof(struct virtio_net_hdr); + + virtio_net_set_mrg_rx_bufs(n, qemu_get_be32(f)); if (version_id >= 3) n->status = qemu_get_be16(f); @@ -1050,8 +1061,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf, n->tx_waiting = 0; n->tx_burst = net->txburst; - n->mergeable_rx_bufs = 0; - n->guest_hdr_len = sizeof(struct virtio_net_hdr); + virtio_net_set_mrg_rx_bufs(n, 0); n->promisc = 1; /* for compatibility */ n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN); From 523a59f596a3e62f5a28eb171adba35e71310040 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 25 Oct 2012 12:37:57 +0200 Subject: [PATCH 25/25] pci: avoid destroying bridge address space windows in a transaction Calling memory_region_destroy() in a transaction is illegal (and aborts), as until the transaction is committed, the region remains live. Fix by moving destruction until after the transaction commits. This requires having an extra set of regions, so the new and old regions can coexist. Signed-off-by: Avi Kivity Signed-off-by: Michael S. Tsirkin --- hw/pci_bridge.c | 50 ++++++++++++++++++++++++++-------------------- hw/pci_internals.h | 24 ++++++++++++++-------- 2 files changed, 44 insertions(+), 30 deletions(-) diff --git a/hw/pci_bridge.c b/hw/pci_bridge.c index 5c6455f6fa..4680501e4e 100644 --- a/hw/pci_bridge.c +++ b/hw/pci_bridge.c @@ -151,58 +151,63 @@ static void pci_bridge_init_alias(PCIBridge *bridge, MemoryRegion *alias, memory_region_add_subregion_overlap(parent_space, base, alias, 1); } -static void pci_bridge_cleanup_alias(MemoryRegion *alias, - MemoryRegion *parent_space) -{ - memory_region_del_subregion(parent_space, alias); - memory_region_destroy(alias); -} - -static void pci_bridge_region_init(PCIBridge *br) +static PCIBridgeWindows *pci_bridge_region_init(PCIBridge *br) { PCIBus *parent = br->dev.bus; + PCIBridgeWindows *w = g_new(PCIBridgeWindows, 1); uint16_t cmd = pci_get_word(br->dev.config + PCI_COMMAND); - pci_bridge_init_alias(br, &br->alias_pref_mem, + pci_bridge_init_alias(br, &w->alias_pref_mem, PCI_BASE_ADDRESS_MEM_PREFETCH, "pci_bridge_pref_mem", &br->address_space_mem, parent->address_space_mem, cmd & PCI_COMMAND_MEMORY); - pci_bridge_init_alias(br, &br->alias_mem, + pci_bridge_init_alias(br, &w->alias_mem, PCI_BASE_ADDRESS_SPACE_MEMORY, "pci_bridge_mem", &br->address_space_mem, parent->address_space_mem, cmd & PCI_COMMAND_MEMORY); - pci_bridge_init_alias(br, &br->alias_io, + pci_bridge_init_alias(br, &w->alias_io, PCI_BASE_ADDRESS_SPACE_IO, "pci_bridge_io", &br->address_space_io, parent->address_space_io, cmd & PCI_COMMAND_IO); /* TODO: optinal VGA and VGA palette snooping support. */ + + return w; } -static void pci_bridge_region_cleanup(PCIBridge *br) +static void pci_bridge_region_del(PCIBridge *br, PCIBridgeWindows *w) { PCIBus *parent = br->dev.bus; - pci_bridge_cleanup_alias(&br->alias_io, - parent->address_space_io); - pci_bridge_cleanup_alias(&br->alias_mem, - parent->address_space_mem); - pci_bridge_cleanup_alias(&br->alias_pref_mem, - parent->address_space_mem); + + memory_region_del_subregion(parent->address_space_io, &w->alias_io); + memory_region_del_subregion(parent->address_space_mem, &w->alias_mem); + memory_region_del_subregion(parent->address_space_mem, &w->alias_pref_mem); +} + +static void pci_bridge_region_cleanup(PCIBridge *br, PCIBridgeWindows *w) +{ + memory_region_destroy(&w->alias_io); + memory_region_destroy(&w->alias_mem); + memory_region_destroy(&w->alias_pref_mem); + g_free(w); } static void pci_bridge_update_mappings(PCIBridge *br) { + PCIBridgeWindows *w = br->windows; + /* Make updates atomic to: handle the case of one VCPU updating the bridge * while another accesses an unaffected region. */ memory_region_transaction_begin(); - pci_bridge_region_cleanup(br); - pci_bridge_region_init(br); + pci_bridge_region_del(br, br->windows); + br->windows = pci_bridge_region_init(br); memory_region_transaction_commit(); + pci_bridge_region_cleanup(br, w); } /* default write_config function for PCI-to-PCI bridge */ @@ -326,7 +331,7 @@ int pci_bridge_initfn(PCIDevice *dev) memory_region_init(&br->address_space_mem, "pci_bridge_pci", INT64_MAX); sec_bus->address_space_io = &br->address_space_io; memory_region_init(&br->address_space_io, "pci_bridge_io", 65536); - pci_bridge_region_init(br); + br->windows = pci_bridge_region_init(br); QLIST_INIT(&sec_bus->child); QLIST_INSERT_HEAD(&parent->child, sec_bus, sibling); return 0; @@ -338,7 +343,8 @@ void pci_bridge_exitfn(PCIDevice *pci_dev) PCIBridge *s = DO_UPCAST(PCIBridge, dev, pci_dev); assert(QLIST_EMPTY(&s->sec_bus.child)); QLIST_REMOVE(&s->sec_bus, sibling); - pci_bridge_region_cleanup(s); + pci_bridge_region_del(s, s->windows); + pci_bridge_region_cleanup(s, s->windows); memory_region_destroy(&s->address_space_mem); memory_region_destroy(&s->address_space_io); /* qbus_free() is called automatically by qdev_free() */ diff --git a/hw/pci_internals.h b/hw/pci_internals.h index c931b64b46..21d0ce6973 100644 --- a/hw/pci_internals.h +++ b/hw/pci_internals.h @@ -40,6 +40,19 @@ struct PCIBus { int *irq_count; }; +typedef struct PCIBridgeWindows PCIBridgeWindows; + +/* + * Aliases for each of the address space windows that the bridge + * can forward. Mapped into the bridge's parent's address space, + * as subregions. + */ +struct PCIBridgeWindows { + MemoryRegion alias_pref_mem; + MemoryRegion alias_mem; + MemoryRegion alias_io; +}; + struct PCIBridge { PCIDevice dev; @@ -55,14 +68,9 @@ struct PCIBridge { */ MemoryRegion address_space_mem; MemoryRegion address_space_io; - /* - * Aliases for each of the address space windows that the bridge - * can forward. Mapped into the bridge's parent's address space, - * as subregions. - */ - MemoryRegion alias_pref_mem; - MemoryRegion alias_mem; - MemoryRegion alias_io; + + PCIBridgeWindows *windows; + pci_map_irq_fn map_irq; const char *bus_name; };