From caf108bc587908f7608729f39639dccdfa53010b Mon Sep 17 00:00:00 2001 From: Julia Suvorova Date: Tue, 13 Jul 2021 02:42:00 +0200 Subject: [PATCH 01/19] hw/i386/acpi-build: Add ACPI PCI hot-plug methods to Q35 Implement notifications and gpe to support q35 ACPI PCI hot-plug. Use 0xcc4 - 0xcd7 range for 'acpi-pci-hotplug' io ports. Signed-off-by: Julia Suvorova Reviewed-by: Igor Mammedov Reviewed-by: Marcel Apfelbaum Message-Id: <20210713004205.775386-2-jusual@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: David Gibson --- hw/acpi/pcihp.c | 6 +++--- hw/acpi/piix4.c | 4 +++- hw/i386/acpi-build.c | 30 +++++++++++++++++++----------- hw/i386/acpi-build.h | 4 ++++ include/hw/acpi/ich9.h | 2 ++ include/hw/acpi/pcihp.h | 3 ++- 6 files changed, 33 insertions(+), 16 deletions(-) diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index 4999277d57..d98a284b7a 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -37,7 +37,6 @@ #include "qom/qom-qobject.h" #include "trace.h" -#define ACPI_PCIHP_ADDR 0xae00 #define ACPI_PCIHP_SIZE 0x0018 #define PCI_UP_BASE 0x0000 #define PCI_DOWN_BASE 0x0004 @@ -488,10 +487,11 @@ static const MemoryRegionOps acpi_pcihp_io_ops = { }; void acpi_pcihp_init(Object *owner, AcpiPciHpState *s, PCIBus *root_bus, - MemoryRegion *address_space_io, bool bridges_enabled) + MemoryRegion *address_space_io, bool bridges_enabled, + uint16_t io_base) { s->io_len = ACPI_PCIHP_SIZE; - s->io_base = ACPI_PCIHP_ADDR; + s->io_base = io_base; s->root = root_bus; s->legacy_piix = !bridges_enabled; diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c index 0bd23d74e2..48f7a1edbc 100644 --- a/hw/acpi/piix4.c +++ b/hw/acpi/piix4.c @@ -49,6 +49,8 @@ #define GPE_BASE 0xafe0 #define GPE_LEN 4 +#define ACPI_PCIHP_ADDR_PIIX4 0xae00 + struct pci_status { uint32_t up; /* deprecated, maintained for migration compatibility */ uint32_t down; @@ -607,7 +609,7 @@ static void piix4_acpi_system_hot_add_init(MemoryRegion *parent, if (s->use_acpi_hotplug_bridge || s->use_acpi_root_pci_hotplug) { acpi_pcihp_init(OBJECT(s), &s->acpi_pci_hotplug, bus, parent, - s->use_acpi_hotplug_bridge); + s->use_acpi_hotplug_bridge, ACPI_PCIHP_ADDR_PIIX4); } s->cpu_hotplug_legacy = true; diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 357437ff1d..e1c246d6e8 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -219,10 +219,6 @@ static void acpi_get_pm_info(MachineState *machine, AcpiPmInfo *pm) /* w2k requires FADT(rev1) or it won't boot, keep PC compatible */ pm->fadt.rev = 1; pm->cpu_hp_io_base = PIIX4_CPU_HOTPLUG_IO_BASE; - pm->pcihp_io_base = - object_property_get_uint(obj, ACPI_PCIHP_IO_BASE_PROP, NULL); - pm->pcihp_io_len = - object_property_get_uint(obj, ACPI_PCIHP_IO_LEN_PROP, NULL); } if (lpc) { uint64_t smi_features = object_property_get_uint(lpc, @@ -238,6 +234,10 @@ static void acpi_get_pm_info(MachineState *machine, AcpiPmInfo *pm) pm->smi_on_cpu_unplug = !!(smi_features & BIT_ULL(ICH9_LPC_SMI_F_CPU_HOT_UNPLUG_BIT)); } + pm->pcihp_io_base = + object_property_get_uint(obj, ACPI_PCIHP_IO_BASE_PROP, NULL); + pm->pcihp_io_len = + object_property_get_uint(obj, ACPI_PCIHP_IO_LEN_PROP, NULL); /* The above need not be conditional on machine type because the reset port * happens to be the same on PIIX (pc) and ICH9 (q35). */ @@ -392,6 +392,9 @@ static void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus, if (!pdev) { if (bsel) { /* add hotplug slots for non present devices */ + if (pci_bus_is_express(bus) && slot > 0) { + break; + } dev = aml_device("S%.02X", PCI_DEVFN(slot, 0)); aml_append(dev, aml_name_decl("_SUN", aml_int(slot))); aml_append(dev, aml_name_decl("_ADR", aml_int(slot << 16))); @@ -521,7 +524,7 @@ static void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus, QLIST_FOREACH(sec, &bus->child, sibling) { int32_t devfn = sec->parent_dev->devfn; - if (pci_bus_is_root(sec) || pci_bus_is_express(sec)) { + if (pci_bus_is_root(sec)) { continue; } @@ -1251,7 +1254,7 @@ static void build_piix4_isa_bridge(Aml *table) aml_append(table, scope); } -static void build_piix4_pci_hotplug(Aml *table) +static void build_x86_acpi_pci_hotplug(Aml *table, uint64_t pcihp_addr) { Aml *scope; Aml *field; @@ -1260,20 +1263,22 @@ static void build_piix4_pci_hotplug(Aml *table) scope = aml_scope("_SB.PCI0"); aml_append(scope, - aml_operation_region("PCST", AML_SYSTEM_IO, aml_int(0xae00), 0x08)); + aml_operation_region("PCST", AML_SYSTEM_IO, aml_int(pcihp_addr), 0x08)); field = aml_field("PCST", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS); aml_append(field, aml_named_field("PCIU", 32)); aml_append(field, aml_named_field("PCID", 32)); aml_append(scope, field); aml_append(scope, - aml_operation_region("SEJ", AML_SYSTEM_IO, aml_int(0xae08), 0x04)); + aml_operation_region("SEJ", AML_SYSTEM_IO, + aml_int(pcihp_addr + ACPI_PCIHP_SEJ_BASE), 0x04)); field = aml_field("SEJ", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS); aml_append(field, aml_named_field("B0EJ", 32)); aml_append(scope, field); aml_append(scope, - aml_operation_region("BNMR", AML_SYSTEM_IO, aml_int(0xae10), 0x08)); + aml_operation_region("BNMR", AML_SYSTEM_IO, + aml_int(pcihp_addr + ACPI_PCIHP_BNMR_BASE), 0x08)); field = aml_field("BNMR", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS); aml_append(field, aml_named_field("BNUM", 32)); aml_append(field, aml_named_field("PIDX", 32)); @@ -1407,7 +1412,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, build_piix4_isa_bridge(dsdt); build_isa_devices_aml(dsdt); if (pm->pcihp_bridge_en || pm->pcihp_root_en) { - build_piix4_pci_hotplug(dsdt); + build_x86_acpi_pci_hotplug(dsdt, pm->pcihp_io_base); } build_piix4_pci0_int(dsdt); } else { @@ -1455,6 +1460,9 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, } build_q35_isa_bridge(dsdt); build_isa_devices_aml(dsdt); + if (pm->pcihp_bridge_en) { + build_x86_acpi_pci_hotplug(dsdt, pm->pcihp_io_base); + } build_q35_pci0_int(dsdt); if (pcms->smbus && !pcmc->do_not_add_smb_acpi) { build_smb0(dsdt, pcms->smbus, ICH9_SMB_DEV, ICH9_SMB_FUNC); @@ -1489,7 +1497,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, { aml_append(scope, aml_name_decl("_HID", aml_string("ACPI0006"))); - if (misc->is_piix4 && (pm->pcihp_bridge_en || pm->pcihp_root_en)) { + if (pm->pcihp_bridge_en || pm->pcihp_root_en) { method = aml_method("_E01", 0, AML_NOTSERIALIZED); aml_append(method, aml_acquire(aml_name("\\_SB.PCI0.BLCK"), 0xFFFF)); diff --git a/hw/i386/acpi-build.h b/hw/i386/acpi-build.h index 74df5fc612..487ec7710f 100644 --- a/hw/i386/acpi-build.h +++ b/hw/i386/acpi-build.h @@ -5,6 +5,10 @@ extern const struct AcpiGenericAddress x86_nvdimm_acpi_dsmio; +/* PCI Hot-plug registers bases. See docs/spec/acpi_pci_hotplug.txt */ +#define ACPI_PCIHP_SEJ_BASE 0x8 +#define ACPI_PCIHP_BNMR_BASE 0x10 + void acpi_setup(void); #endif diff --git a/include/hw/acpi/ich9.h b/include/hw/acpi/ich9.h index df519e40b5..596120d97f 100644 --- a/include/hw/acpi/ich9.h +++ b/include/hw/acpi/ich9.h @@ -28,6 +28,8 @@ #include "hw/acpi/acpi_dev_interface.h" #include "hw/acpi/tco.h" +#define ACPI_PCIHP_ADDR_ICH9 0x0cc4 + typedef struct ICH9LPCPMRegs { /* * In ich9 spec says that pm1_cnt register is 32bit width and diff --git a/include/hw/acpi/pcihp.h b/include/hw/acpi/pcihp.h index 2dd90aea30..af1a169fc3 100644 --- a/include/hw/acpi/pcihp.h +++ b/include/hw/acpi/pcihp.h @@ -55,7 +55,8 @@ typedef struct AcpiPciHpState { } AcpiPciHpState; void acpi_pcihp_init(Object *owner, AcpiPciHpState *, PCIBus *root, - MemoryRegion *address_space_io, bool bridges_enabled); + MemoryRegion *address_space_io, bool bridges_enabled, + uint16_t io_base); void acpi_pcihp_device_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp); From c0e427d6eb5fefc5382ab01e78611740db1b5d06 Mon Sep 17 00:00:00 2001 From: Julia Suvorova Date: Tue, 13 Jul 2021 02:42:01 +0200 Subject: [PATCH 02/19] hw/acpi/ich9: Enable ACPI PCI hot-plug Add acpi_pcihp to ich9_pm as part of 'acpi-pci-hotplug-with-bridge-support' option. Set default to false. Signed-off-by: Julia Suvorova Signed-off-by: Marcel Apfelbaum Reviewed-by: Igor Mammedov Message-Id: <20210713004205.775386-3-jusual@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: David Gibson --- hw/acpi/acpi-x86-stub.c | 6 ++++ hw/acpi/ich9.c | 70 +++++++++++++++++++++++++++++++++++++++++ hw/acpi/pcihp.c | 12 +++++-- hw/i386/acpi-build.c | 14 ++++++--- hw/i386/acpi-build.h | 1 + include/hw/acpi/ich9.h | 3 ++ 6 files changed, 100 insertions(+), 6 deletions(-) diff --git a/hw/acpi/acpi-x86-stub.c b/hw/acpi/acpi-x86-stub.c index f88d6a090b..e9e46c5c5f 100644 --- a/hw/acpi/acpi-x86-stub.c +++ b/hw/acpi/acpi-x86-stub.c @@ -1,7 +1,13 @@ #include "qemu/osdep.h" #include "hw/i386/pc.h" +#include "hw/i386/acpi-build.h" void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid, const CPUArchIdList *apic_ids, GArray *entry) { } + +Object *acpi_get_i386_pci_host(void) +{ + return NULL; +} diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c index 4daa79ec8d..2f4eb453ac 100644 --- a/hw/acpi/ich9.c +++ b/hw/acpi/ich9.c @@ -217,6 +217,26 @@ static const VMStateDescription vmstate_cpuhp_state = { } }; +static bool vmstate_test_use_pcihp(void *opaque) +{ + ICH9LPCPMRegs *s = opaque; + + return s->use_acpi_hotplug_bridge; +} + +static const VMStateDescription vmstate_pcihp_state = { + .name = "ich9_pm/pcihp", + .version_id = 1, + .minimum_version_id = 1, + .needed = vmstate_test_use_pcihp, + .fields = (VMStateField[]) { + VMSTATE_PCI_HOTPLUG(acpi_pci_hotplug, + ICH9LPCPMRegs, + NULL, NULL), + VMSTATE_END_OF_LIST() + } +}; + const VMStateDescription vmstate_ich9_pm = { .name = "ich9_pm", .version_id = 1, @@ -238,6 +258,7 @@ const VMStateDescription vmstate_ich9_pm = { &vmstate_memhp_state, &vmstate_tco_io_state, &vmstate_cpuhp_state, + &vmstate_pcihp_state, NULL } }; @@ -259,6 +280,10 @@ static void pm_reset(void *opaque) } pm->smi_en_wmask = ~0; + if (pm->use_acpi_hotplug_bridge) { + acpi_pcihp_reset(&pm->acpi_pci_hotplug, true); + } + acpi_update_sci(&pm->acpi_regs, pm->irq); } @@ -297,6 +322,18 @@ void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm, pm->enable_tco = true; acpi_pm_tco_init(&pm->tco_regs, &pm->io); + if (pm->use_acpi_hotplug_bridge) { + acpi_pcihp_init(OBJECT(lpc_pci), + &pm->acpi_pci_hotplug, + pci_get_bus(lpc_pci), + pci_address_space_io(lpc_pci), + true, + ACPI_PCIHP_ADDR_ICH9); + + qbus_set_hotplug_handler(BUS(pci_get_bus(lpc_pci)), + OBJECT(lpc_pci)); + } + pm->irq = sci_irq; qemu_register_reset(pm_reset, pm); pm->powerdown_notifier.notify = pm_powerdown_req; @@ -368,6 +405,20 @@ static void ich9_pm_set_enable_tco(Object *obj, bool value, Error **errp) s->pm.enable_tco = value; } +static bool ich9_pm_get_acpi_pci_hotplug(Object *obj, Error **errp) +{ + ICH9LPCState *s = ICH9_LPC_DEVICE(obj); + + return s->pm.use_acpi_hotplug_bridge; +} + +static void ich9_pm_set_acpi_pci_hotplug(Object *obj, bool value, Error **errp) +{ + ICH9LPCState *s = ICH9_LPC_DEVICE(obj); + + s->pm.use_acpi_hotplug_bridge = value; +} + void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm) { static const uint32_t gpe0_len = ICH9_PMIO_GPE0_LEN; @@ -376,6 +427,7 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm) pm->disable_s3 = 0; pm->disable_s4 = 0; pm->s4_val = 2; + pm->use_acpi_hotplug_bridge = false; object_property_add_uint32_ptr(obj, ACPI_PM_PROP_PM_IO_BASE, &pm->pm_io_base, OBJ_PROP_FLAG_READ); @@ -399,6 +451,9 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm) object_property_add_bool(obj, ACPI_PM_PROP_TCO_ENABLED, ich9_pm_get_enable_tco, ich9_pm_set_enable_tco); + object_property_add_bool(obj, "acpi-pci-hotplug-with-bridge-support", + ich9_pm_get_acpi_pci_hotplug, + ich9_pm_set_acpi_pci_hotplug); } void ich9_pm_device_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, @@ -406,6 +461,11 @@ void ich9_pm_device_pre_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, { ICH9LPCState *lpc = ICH9_LPC_DEVICE(hotplug_dev); + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + acpi_pcihp_device_pre_plug_cb(hotplug_dev, dev, errp); + return; + } + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) && !lpc->pm.acpi_memory_hotplug.is_enabled) { error_setg(errp, @@ -441,6 +501,9 @@ void ich9_pm_device_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, } else { acpi_cpu_plug_cb(hotplug_dev, &lpc->pm.cpuhp_state, dev, errp); } + } else if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + acpi_pcihp_device_plug_cb(hotplug_dev, &lpc->pm.acpi_pci_hotplug, + dev, errp); } else { error_setg(errp, "acpi: device plug request for not supported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -473,6 +536,10 @@ void ich9_pm_device_unplug_request_cb(HotplugHandler *hotplug_dev, acpi_cpu_unplug_request_cb(hotplug_dev, &lpc->pm.cpuhp_state, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + acpi_pcihp_device_unplug_request_cb(hotplug_dev, + &lpc->pm.acpi_pci_hotplug, + dev, errp); } else { error_setg(errp, "acpi: device unplug request for not supported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -490,6 +557,9 @@ void ich9_pm_device_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU) && !lpc->pm.cpu_hotplug_legacy) { acpi_cpu_unplug_cb(&lpc->pm.cpuhp_state, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + acpi_pcihp_device_unplug_cb(hotplug_dev, &lpc->pm.acpi_pci_hotplug, + dev, errp); } else { error_setg(errp, "acpi: device unplug for not supported device" " type: %s", object_get_typename(OBJECT(dev))); diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index d98a284b7a..9fdc6342b0 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -30,6 +30,8 @@ #include "hw/pci-host/i440fx.h" #include "hw/pci/pci.h" #include "hw/pci/pci_bridge.h" +#include "hw/pci/pci_host.h" +#include "hw/i386/acpi-build.h" #include "hw/acpi/acpi.h" #include "hw/pci/pci_bus.h" #include "migration/vmstate.h" @@ -103,6 +105,7 @@ static void *acpi_set_bsel(PCIBus *bus, void *opaque) static void acpi_set_pci_info(void) { static bool bsel_is_set; + Object *host = acpi_get_i386_pci_host(); PCIBus *bus; unsigned bsel_alloc = ACPI_PCIHP_BSEL_DEFAULT; @@ -111,7 +114,11 @@ static void acpi_set_pci_info(void) } bsel_is_set = true; - bus = find_i440fx(); /* TODO: Q35 support */ + if (!host) { + return; + } + + bus = PCI_HOST_BRIDGE(host)->bus; if (bus) { /* Scan all PCI buses. Set property to enable acpi based hotplug. */ pci_for_each_bus_depth_first(bus, acpi_set_bsel, NULL, &bsel_alloc); @@ -121,13 +128,14 @@ static void acpi_set_pci_info(void) static void acpi_pcihp_disable_root_bus(void) { static bool root_hp_disabled; + Object *host = acpi_get_i386_pci_host(); PCIBus *bus; if (root_hp_disabled) { return; } - bus = find_i440fx(); + bus = PCI_HOST_BRIDGE(host)->bus; if (bus) { /* setting the hotplug handler to NULL makes the bus non-hotpluggable */ qbus_set_hotplug_handler(BUS(bus), NULL); diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index e1c246d6e8..bc966a4110 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -299,7 +299,7 @@ static void acpi_get_misc_info(AcpiMiscInfo *info) * Because of the PXB hosts we cannot simply query TYPE_PCI_HOST_BRIDGE. * On i386 arch we only have two pci hosts, so we can look only for them. */ -static Object *acpi_get_i386_pci_host(void) +Object *acpi_get_i386_pci_host(void) { PCIHostState *host; @@ -320,7 +320,10 @@ static void acpi_get_pci_holes(Range *hole, Range *hole64) Object *pci_host; pci_host = acpi_get_i386_pci_host(); - g_assert(pci_host); + + if (!pci_host) { + return; + } range_set_bounds1(hole, object_property_get_uint(pci_host, @@ -1765,6 +1768,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, PCIBus *bus = NULL; pci_host = acpi_get_i386_pci_host(); + if (pci_host) { bus = PCI_HOST_BRIDGE(pci_host)->bus; } @@ -2321,7 +2325,9 @@ static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) QObject *o; pci_host = acpi_get_i386_pci_host(); - g_assert(pci_host); + if (!pci_host) { + return false; + } o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_BASE, NULL); if (!o) { @@ -2351,7 +2357,7 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) AcpiPmInfo pm; AcpiMiscInfo misc; AcpiMcfgInfo mcfg; - Range pci_hole, pci_hole64; + Range pci_hole = {}, pci_hole64 = {}; uint8_t *u; size_t aml_len = 0; GArray *tables_blob = tables->table_data; diff --git a/hw/i386/acpi-build.h b/hw/i386/acpi-build.h index 487ec7710f..0dce155c8c 100644 --- a/hw/i386/acpi-build.h +++ b/hw/i386/acpi-build.h @@ -10,5 +10,6 @@ extern const struct AcpiGenericAddress x86_nvdimm_acpi_dsmio; #define ACPI_PCIHP_BNMR_BASE 0x10 void acpi_setup(void); +Object *acpi_get_i386_pci_host(void); #endif diff --git a/include/hw/acpi/ich9.h b/include/hw/acpi/ich9.h index 596120d97f..a329ce43ab 100644 --- a/include/hw/acpi/ich9.h +++ b/include/hw/acpi/ich9.h @@ -24,6 +24,7 @@ #include "hw/acpi/acpi.h" #include "hw/acpi/cpu_hotplug.h" #include "hw/acpi/cpu.h" +#include "hw/acpi/pcihp.h" #include "hw/acpi/memory_hotplug.h" #include "hw/acpi/acpi_dev_interface.h" #include "hw/acpi/tco.h" @@ -55,6 +56,8 @@ typedef struct ICH9LPCPMRegs { AcpiCpuHotplug gpe_cpu; CPUHotplugState cpuhp_state; + bool use_acpi_hotplug_bridge; + AcpiPciHpState acpi_pci_hotplug; MemHotplugState acpi_memory_hotplug; uint8_t disable_s3; From 3f3cbbb2369ebba67cccf8c60d6a0043b315e17c Mon Sep 17 00:00:00 2001 From: Julia Suvorova Date: Tue, 13 Jul 2021 02:42:02 +0200 Subject: [PATCH 03/19] hw/pci/pcie: Do not set HPC flag if acpihp is used Instead of changing the hot-plug type in _OSC register, do not set the 'Hot-Plug Capable' flag. This way guest will choose ACPI hot-plug if it is preferred and leave the option to use SHPC with pcie-pci-bridge. The ability to control hot-plug for each downstream port is retained, while 'hotplug=off' on the port means all hot-plug types are disabled. Signed-off-by: Julia Suvorova Reviewed-by: Igor Mammedov Reviewed-by: Marcel Apfelbaum Reviewed-by: David Gibson Message-Id: <20210713004205.775386-4-jusual@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/pcihp.c | 8 ++++++++ hw/core/machine.c | 1 - hw/i386/pc_q35.c | 11 +++++++++++ hw/pci/pcie.c | 8 +++++++- hw/pci/pcie_port.c | 1 + include/hw/pci/pcie_port.h | 5 ++++- 6 files changed, 31 insertions(+), 3 deletions(-) diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index 9fdc6342b0..f4d706e47d 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -31,6 +31,7 @@ #include "hw/pci/pci.h" #include "hw/pci/pci_bridge.h" #include "hw/pci/pci_host.h" +#include "hw/pci/pcie_port.h" #include "hw/i386/acpi-build.h" #include "hw/acpi/acpi.h" #include "hw/pci/pci_bus.h" @@ -336,6 +337,13 @@ void acpi_pcihp_device_plug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s, object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) { PCIBus *sec = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev)); + /* Remove all hot-plug handlers if hot-plug is disabled on slot */ + if (object_dynamic_cast(OBJECT(dev), TYPE_PCIE_SLOT) && + !PCIE_SLOT(pdev)->hotplug) { + qbus_set_hotplug_handler(BUS(sec), NULL); + return; + } + qbus_set_hotplug_handler(BUS(sec), OBJECT(hotplug_dev)); /* We don't have to overwrite any other hotplug handler yet */ assert(QLIST_EMPTY(&sec->child)); diff --git a/hw/core/machine.c b/hw/core/machine.c index 6f59fb0b7f..775add0795 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -584,7 +584,6 @@ static void machine_set_memdev(Object *obj, const char *value, Error **errp) ms->ram_memdev_id = g_strdup(value); } - static void machine_init_notify(Notifier *notifier, void *data) { MachineState *machine = MACHINE(qdev_get_machine()); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 46a0f196f4..04b4a4788d 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -37,6 +37,7 @@ #include "sysemu/kvm.h" #include "hw/kvm/clock.h" #include "hw/pci-host/q35.h" +#include "hw/pci/pcie_port.h" #include "hw/qdev-properties.h" #include "hw/i386/x86.h" #include "hw/i386/pc.h" @@ -136,6 +137,7 @@ static void pc_q35_init(MachineState *machine) ram_addr_t lowmem; DriveInfo *hd[MAX_SATA_PORTS]; MachineClass *mc = MACHINE_GET_CLASS(machine); + bool acpi_pcihp; /* Check whether RAM fits below 4G (leaving 1/2 GByte for IO memory * and 256 Mbytes for PCI Express Enhanced Configuration Access Mapping @@ -236,6 +238,15 @@ static void pc_q35_init(MachineState *machine) object_property_set_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, OBJECT(lpc), &error_abort); + acpi_pcihp = object_property_get_bool(OBJECT(lpc), + "acpi-pci-hotplug-with-bridge-support", + NULL); + + if (acpi_pcihp) { + object_register_sugar_prop(TYPE_PCIE_SLOT, "native-hotplug", + "false", true); + } + /* irq lines */ gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled); diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index fd0fa157e8..6e95d82903 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -529,7 +529,13 @@ void pcie_cap_slot_init(PCIDevice *dev, PCIESlot *s) PCI_EXP_SLTCAP_PIP | PCI_EXP_SLTCAP_AIP | PCI_EXP_SLTCAP_ABP); - if (s->hotplug) { + + /* + * Enable native hot-plug on all hot-plugged bridges unless + * hot-plug is disabled on the slot. + */ + if (s->hotplug && + (s->native_hotplug || DEVICE(dev)->hotplugged)) { pci_long_test_and_set_mask(dev->config + pos + PCI_EXP_SLTCAP, PCI_EXP_SLTCAP_HPS | PCI_EXP_SLTCAP_HPC); diff --git a/hw/pci/pcie_port.c b/hw/pci/pcie_port.c index eb563ad435..da850e8dde 100644 --- a/hw/pci/pcie_port.c +++ b/hw/pci/pcie_port.c @@ -148,6 +148,7 @@ static Property pcie_slot_props[] = { DEFINE_PROP_UINT8("chassis", PCIESlot, chassis, 0), DEFINE_PROP_UINT16("slot", PCIESlot, slot, 0), DEFINE_PROP_BOOL("hotplug", PCIESlot, hotplug, true), + DEFINE_PROP_BOOL("native-hotplug", PCIESlot, native_hotplug, true), DEFINE_PROP_END_OF_LIST() }; diff --git a/include/hw/pci/pcie_port.h b/include/hw/pci/pcie_port.h index bea8ecad0f..e25b289ce8 100644 --- a/include/hw/pci/pcie_port.h +++ b/include/hw/pci/pcie_port.h @@ -57,8 +57,11 @@ struct PCIESlot { /* Disable ACS (really for a pcie_root_port) */ bool disable_acs; - /* Indicates whether hot-plug is enabled on the slot */ + /* Indicates whether any type of hot-plug is allowed on the slot */ bool hotplug; + + bool native_hotplug; + QLIST_ENTRY(PCIESlot) next; }; From 383d11f217feca0f9947f790f2deafc09fe69ec5 Mon Sep 17 00:00:00 2001 From: Julia Suvorova Date: Tue, 13 Jul 2021 02:42:03 +0200 Subject: [PATCH 04/19] bios-tables-test: Allow changes in DSDT ACPI tables All DSDT Q35 tables will be modified because ACPI hot-plug is enabled by default. Signed-off-by: Julia Suvorova Reviewed-by: Igor Mammedov Reviewed-by: Marcel Apfelbaum Message-Id: <20210713004205.775386-5-jusual@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/qtest/bios-tables-test-allowed-diff.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..c5167f48af 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,12 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/q35/DSDT", +"tests/data/acpi/q35/DSDT.tis", +"tests/data/acpi/q35/DSDT.bridge", +"tests/data/acpi/q35/DSDT.mmio64", +"tests/data/acpi/q35/DSDT.ipmibt", +"tests/data/acpi/q35/DSDT.cphp", +"tests/data/acpi/q35/DSDT.memhp", +"tests/data/acpi/q35/DSDT.acpihmat", +"tests/data/acpi/q35/DSDT.numamem", +"tests/data/acpi/q35/DSDT.dimmpxm", +"tests/data/acpi/q35/DSDT.nohpet", From 17858a169508609ca9063c544833e5a1adeb7b52 Mon Sep 17 00:00:00 2001 From: Julia Suvorova Date: Tue, 13 Jul 2021 02:42:04 +0200 Subject: [PATCH 05/19] hw/acpi/ich9: Set ACPI PCI hot-plug as default on Q35 Q35 has three different types of PCI devices hot-plug: PCIe Native, SHPC Native and ACPI hot-plug. This patch changes the default choice for cold-plugged bridges from PCIe Native to ACPI Hot-plug with ability to use SHPC and PCIe Native for hot-plugged bridges. This is a list of the PCIe Native hot-plug issues that led to this change: * no racy behavior during boot (see 110c477c2ed) * no delay during deleting - after the actual power off software must wait at least 1 second before indicating about it. This case is quite important for users, it even has its own bug: https://bugzilla.redhat.com/show_bug.cgi?id=1594168 * no timer-based behavior - in addition to the previous example, the attention button has a 5-second waiting period, during which the operation can be canceled with a second press. While this looks fine for manual button control, automation will result in the need to queue or drop events, and the software receiving events in all sort of unspecified combinations of attention/power indicator states, which is racy and uppredictable. * fixes: * https://bugzilla.redhat.com/show_bug.cgi?id=1752465 * https://bugzilla.redhat.com/show_bug.cgi?id=1690256 To return to PCIe Native hot-plug: -global ICH9-LPC.acpi-pci-hotplug-with-bridge-support=off Known issue: older linux guests need the following flag to allow hotplugged pci express devices to use io: -device pcie-root-port,io-reserve=4096. io is unusual for pci express so this seems minor. We'll fix this by a follow up patch. Signed-off-by: Julia Suvorova Reviewed-by: Igor Mammedov Message-Id: <20210713004205.775386-6-jusual@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: David Gibson --- hw/acpi/ich9.c | 2 +- hw/i386/pc.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c index 2f4eb453ac..778e27b659 100644 --- a/hw/acpi/ich9.c +++ b/hw/acpi/ich9.c @@ -427,7 +427,7 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm) pm->disable_s3 = 0; pm->disable_s4 = 0; pm->s4_val = 2; - pm->use_acpi_hotplug_bridge = false; + pm->use_acpi_hotplug_bridge = true; object_property_add_uint32_ptr(obj, ACPI_PM_PROP_PM_IO_BASE, &pm->pm_io_base, OBJ_PROP_FLAG_READ); diff --git a/hw/i386/pc.c b/hw/i386/pc.c index aa79c5e0e6..f4c7a78362 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -99,6 +99,7 @@ GlobalProperty pc_compat_6_0[] = { { "qemu64" "-" TYPE_X86_CPU, "model", "6" }, { "qemu64" "-" TYPE_X86_CPU, "stepping", "3" }, { TYPE_X86_CPU, "x-vendor-cpuid-only", "off" }, + { "ICH9-LPC", "acpi-pci-hotplug-with-bridge-support", "off" }, }; const size_t pc_compat_6_0_len = G_N_ELEMENTS(pc_compat_6_0); From 1580b897c7b2f794fff7ba140ab757be1f6efadc Mon Sep 17 00:00:00 2001 From: Julia Suvorova Date: Tue, 13 Jul 2021 02:42:05 +0200 Subject: [PATCH 06/19] bios-tables-test: Update golden binaries Add ACPI hot-plug registers to DSDT Q35 tables. Changes in the tables: + Scope (_SB.PCI0) + { + OperationRegion (PCST, SystemIO, 0x0CC4, 0x08) + Field (PCST, DWordAcc, NoLock, WriteAsZeros) + { + PCIU, 32, + PCID, 32 + } + + OperationRegion (SEJ, SystemIO, 0x0CCC, 0x04) + Field (SEJ, DWordAcc, NoLock, WriteAsZeros) + { + B0EJ, 32 + } + + OperationRegion (BNMR, SystemIO, 0x0CD4, 0x08) + Field (BNMR, DWordAcc, NoLock, WriteAsZeros) + { + BNUM, 32, + PIDX, 32 + } + + Mutex (BLCK, 0x00) + Method (PCEJ, 2, NotSerialized) + { + Acquire (BLCK, 0xFFFF) + BNUM = Arg0 + B0EJ = (One << Arg1) + Release (BLCK) + Return (Zero) + } + + Method (AIDX, 2, NotSerialized) + { + Acquire (BLCK, 0xFFFF) + BNUM = Arg0 + PIDX = (One << Arg1) + Local0 = PIDX /* \_SB_.PCI0.PIDX */ + Release (BLCK) + Return (Local0) + } + + Method (PDSM, 6, Serialized) + { + If ((Arg0 == ToUUID ("e5c937d0-3553-4d7a-9117-ea4d19c3434d") /* Device Labeling Interface */)) + { + Local0 = AIDX (Arg4, Arg5) + If ((Arg2 == Zero)) + { + If ((Arg1 == 0x02)) + { + If (!((Local0 == Zero) | (Local0 == 0xFFFFFFFF))) + { + Return (Buffer (One) + { + 0x81 // . + }) + } + } + + Return (Buffer (One) + { + 0x00 // . + }) + } + ElseIf ((Arg2 == 0x07)) + { + Local1 = Package (0x02) + { + Zero, + "" + } + Local1 [Zero] = Local0 + Return (Local1) + } + } + } + } + ... Scope (_GPE) { Name (_HID, "ACPI0006" /* GPE Block Device */) // _HID: Hardware ID + Method (_E01, 0, NotSerialized) // _Exx: Edge-Triggered GPE, xx=0x00-0xFF + { + Acquire (\_SB.PCI0.BLCK, 0xFFFF) + \_SB.PCI0.PCNT () + Release (\_SB.PCI0.BLCK) + } ... + + Device (PHPR) + { + Name (_HID, "PNP0A06" /* Generic Container Device */) // _HID: Hardware ID + Name (_UID, "PCI Hotplug resources") // _UID: Unique ID + Name (_STA, 0x0B) // _STA: Status + Name (_CRS, ResourceTemplate () // _CRS: Current Resource Settings + { + IO (Decode16, + 0x0CC4, // Range Minimum + 0x0CC4, // Range Maximum + 0x01, // Alignment + 0x18, // Length + ) + }) + } } ... And if there is a port in configuration: Device (S10) { Name (_ADR, 0x00020000) // _ADR: Address + Name (BSEL, Zero) + Device (S00) + { + Name (_SUN, Zero) // _SUN: Slot User Number + Name (_ADR, Zero) // _ADR: Address + Method (_EJ0, 1, NotSerialized) // _EJx: Eject Device, x=0-9 + { + PCEJ (BSEL, _SUN) + } + + Method (_DSM, 4, Serialized) // _DSM: Device-Specific Method + { + Return (PDSM (Arg0, Arg1, Arg2, Arg3, BSEL, _SUN)) + } + } + ... + Method (DVNT, 2, NotSerialized) + { + If ((Arg0 & One)) + { + Notify (S00, Arg1) + } ... Signed-off-by: Julia Suvorova Message-Id: <20210713004205.775386-7-jusual@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/data/acpi/q35/DSDT | Bin 7859 -> 8289 bytes tests/data/acpi/q35/DSDT.acpihmat | Bin 9184 -> 9614 bytes tests/data/acpi/q35/DSDT.bridge | Bin 7877 -> 11003 bytes tests/data/acpi/q35/DSDT.cphp | Bin 8323 -> 8753 bytes tests/data/acpi/q35/DSDT.dimmpxm | Bin 9513 -> 9943 bytes tests/data/acpi/q35/DSDT.ipmibt | Bin 7934 -> 8364 bytes tests/data/acpi/q35/DSDT.memhp | Bin 9218 -> 9648 bytes tests/data/acpi/q35/DSDT.mmio64 | Bin 8990 -> 9419 bytes tests/data/acpi/q35/DSDT.nohpet | Bin 7717 -> 8147 bytes tests/data/acpi/q35/DSDT.numamem | Bin 7865 -> 8295 bytes tests/data/acpi/q35/DSDT.tis | Bin 8465 -> 8894 bytes tests/qtest/bios-tables-test-allowed-diff.h | 11 ----------- 12 files changed, 11 deletions(-) diff --git a/tests/data/acpi/q35/DSDT b/tests/data/acpi/q35/DSDT index cccf92f0466fa4eaf2e9e06675b3b102c7a8eb86..842533f53e6db40935c3cdecd1d182edba6c17d4 100644 GIT binary patch delta 466 zcmdmN`_O^QCDauXIPPtIak zAUQdci)n%Iq}s^|43iUh{sRHS5=B8~#>OQ;f?=Wf0@2A?TpzYIYyru7HSX=AqfOH`-McSBME?12>6O_{w~AM zs2<(q#S-8V5X2Gh;pxH~;1^)vXkf;`5g!WTIeRL2LFaQ7^vWU|F delta 57 zcmaFpu-TT&CD*D+yKS90tV5j&1XHSFZh5+Z_5Jv7JJX{>njRGK! zGf+HK0Z6zgL^lMxdc`wxpW)$RiEiWuah#nDT)lu2PJX^YjNDfs+CUsOVt(t-l7wM diff --git a/tests/data/acpi/q35/DSDT.bridge b/tests/data/acpi/q35/DSDT.bridge index eb5d27d95b2cdeda5f7e1f6b151cfea02e6bd907..55ad4bd7ab4759d68031b2f1d0307355a7332fe7 100644 GIT binary patch delta 3198 zcmb7{PjBNy7{Ebzd(?3;ItCjk3cyfap1s>(;m2R;mDzf?)%1mvr$RXI0q}wJmcRyGxq%P zuaBe_71=%4P#92N&%?#cyB()e z^@POlsv^1P2^2=TP^p4ZwrWR0;?LkV6h_&q*Yfs(`BB~#%7@1LOn==rDpk>4M|FMO zZeor0*IZ~Zgt!}6-edZkCAjH-)u0x_+IVS%bsM@lW1CKGzIzz=o2j&-{ju}g-_F)I zweK_k)Uv-AwKlSzeAei$`=_j%^;?SAP5%-q`jNitRSFN7yZ-pC7>!!cp`ilXu#Vl0p#rR- z)s~+Bx3VB?xEEE)-nWlrZ~wqd+Lm1?6}Fk|)lnG-`TftoXg~S#6aPgnH*=|8s=~4_ zt)9+3dyMUAd(Zr@R^R2;PtJpaCwnFHNM%f9a@naIGWX&gr+~#y-2x(*OkeaW)q;Ry zk5$moGaihZ+QYGHwoX5#a`&PnIfY_yyFhGV6oQXul%*gUG^B_G5l2BLGr}B7j2N*Y z2A`rc5ajJ}EEI_#(hy`y1_g-|BhGXlWmF-^CIy)x$V}uQn-pYL_IU8zXCp%%QV^Mf z$k7=^kN00OWsV?oks(_YL?MO}fx#{J0)};qh9p6dL4h2~t$ii62I7X3zXvENNGWg_m1cDSPNQxk-n~+-> zyA))R7>gl;Pr0B$kX;IrCP+GRkP-#aLk4GhbO;4mq999=8RaktSMCtxPIL$b$q*wG zmBAf@hGdx_%TXbYK|!*_$WF=N{f(@@2ACA&E?lZtPL|bof*dM;kuPGKeUd;ZsM#d zu%~z`X?4}JrXr#!hC%`X$C$v8QIMm7fZaQA@+ipJ1F^};>Yzd^dDcuYYzcO_P+?OK z2OhQsn@;FqW3YRK3R{DHAXL~KEa;)a_F!oa6}AU!EN^ue&YE!Y5*QS~f=OEJYrHw| zCDgEvk(T@Vm9L?O<%RrwU%&GAP{XVzzth)6J}Bt-873oDfV1FJoLB>m{f0R*(AaZx z$j83Jt2^-XsE7PL(C|hL{0?dyR(%c6xvb6bAde3lzZ8VttC4dw%|4LxXsZct@8Y9tH*`1^^Ot5>o&G diff --git a/tests/data/acpi/q35/DSDT.cphp b/tests/data/acpi/q35/DSDT.cphp index e55d12990c98d8eed760b858ce18a036b612da1c..ccde2add9f87db0c0eaf0cc155717d5744e4ef40 100644 GIT binary patch delta 466 zcmZp6+~~sP66_LUsKmg)=(&;WH&eZUpP*iRuv2`1v!_9HLx6K|2qX6q9xjgPMgb7V z87LmA03=)#q8ox;z2X_U&+u@uL^pDSIL=N6u3kV1CqLgHM(!&RZ6J=blV7N>LV%}B zghDi|3JX7L{X5Lv2h8IU|1-=Ky-2z7khyaD-#1RLt;TfM$Fl2~Uj?rg^g&M>|NCE-Qej(B7NCF@g0=}Y~kIV5h zsz*0@u>^Po1aZWBc)IWg_yrg^8kjL~#D{`-&YlV$`6UH8rRkG5%FD7H;W@&?C^7kp Zd?uG4OAN?D0XLS(ZVI;SB5WXA7y#g;hnxTa delta 57 zcmdn!((K6P66_MvtiZs)7_^b=H`8VnmetaXB9kx53o{Ei3T{?Z;Afm%qnOU=#u5`B N>=Z9xH~FoiEdbPr5E}ph diff --git a/tests/data/acpi/q35/DSDT.dimmpxm b/tests/data/acpi/q35/DSDT.dimmpxm index 95901f94c0d445919cb84dd1f6d98c646ae8176e..b062e30117f955c7a2ac9629a0512324bbd28bf2 100644 GIT binary patch delta 500 zcmZ4Kb={ZCCDauXIPPtIak zAUQdci)n%Iq}s^|43iUh{sRHS5=B8~#>OQ;f?=Wf0@2A?T6g=`v3UW%*Cx@%ZvK`?$!ow&rxk@FI%a0`nG&EEVc?F8u9J{t{m|mo-VwO&H~3^jZa^I-tdz5?tSm&`@XmD^P=GxwR;r+n0^pBTugx@MNxhi+R{7=Is)ae{Qa zZmGCv?5hOoWwZyBMU2c+h-5h%4*eE)_@@`J4R{F2alH?er92z@XJalN=5z3Vy`Ex3 zu;EyfL3x6oj4YpLbq~EDOFJM))3z82Qn%;S1Jsj-?1BD5 zG7Nit2HQ7IEKms&<@45*_wY1#%*wp>$N z10>s64gcM6wBDG2kzFIXF@ZZ|Yvxg(pMp7ZRT6LkS4YX%##){Q$J{#WVOLiRN5_8m iauXIPPtIak zAUQdci)n%Iq}s^|43iUh{sRHS5=B8~#>OQ;f?=Wf0@2A?T<+5bHs-N?E!|mf=7NyK~8DaL66_Mfq{6_!cz+|;Z>G&EEL!s7BJu74t{m|mo-VwO&H4 b7bx>HPX4Wu&gsSy6CdmpFJL!0N7WVpu^|&B diff --git a/tests/data/acpi/q35/DSDT.mmio64 b/tests/data/acpi/q35/DSDT.mmio64 index 77d46369e48efca9a9e5024542c77cd26144beff..2e0a772a85275c9c3b4c9317d98cc7c3d27417f3 100644 GIT binary patch delta 479 zcmZWl%}T>S7~Ji@Y1C9x)PoXRr3XPH_=Db>O|+>dU78kztt36!iee6WP!L;BUUlyh zp)Vjlg1v}>9()BazJiCkOBDq7Ffg<8&G3D<8_%YoJ9_1L1^`I!g|E&)Y6Qrp|k$q^kRffs;{un1nv#XYZ zO?^ipP}@dZK%T|OD27OuvtG|{aEE_-0h@q_kQ~=LK%UF9p?@~!;$c1q@5k%OdJOB1 zH56<|X(Kf%NN=l8AI|cbbv^cJO*|sY=UE4bUXZ2xAef*{Ary?>oKp`_PwcS=I@75z zY89Gf~gPY5jIs-T>1(DGmt(vHd;|L@ije>S delta 71 zcmX@@InRyDCDauXIPPtIak zAUQdci)n%Iq}s^|43iUh{sRHS5=B8~#>OQ;f?=Wf0@2A?TqMI{i_!+&To4i;8JOY9sCI|Qh7&rnA=7D+yKS90tV5j&1XHSFZh5+Z_5Jv7JJX{>njRGK! zGf+HK0Z6zgL^lMxdc`wxpW)$RiEiWuah#nDT)lu2PJX^YjNDfs+CUsylgPr07oIMSq8v>kzLm0V_@NjWNHwu6_ z&Oq@{1t8&~5Zw^$>J`t(eTIjNCAyIt#Bp{qaP+slXajMao%}+56#_h6 zA{3$-oqU|V8AMbAoL#+`qLo3s|NjdzKq?9<>lrf{K!!xCgT$9Gh-f>7t;daNwt#`7$zt1{09PtC5nQ~jEzfx1j9n{1)`I)xY!GXSeY1j84?Q;IuaQe zmLzUwWxg%RBR&+wbM{p5$S*0#DNUdJR#ukn2+t87Mv2J+ da+#d|nlT^?1>7e$$=R`quz@Vuyk9Pe2>{`!ixL0; delta 62 zcmdnzI?;*CCD Date: Fri, 9 Jul 2021 10:30:16 +0530 Subject: [PATCH 07/19] hw/virtio: add boilerplate for vhost-user-i2c device This creates the QEMU side of the vhost-user-i2c device which connects to the remote daemon. It is based of vhost-user-fs code. Signed-off-by: Viresh Kumar Message-Id: Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/Kconfig | 5 + hw/virtio/meson.build | 1 + hw/virtio/vhost-user-i2c.c | 288 +++++++++++++++++++++++++++++ include/hw/virtio/vhost-user-i2c.h | 28 +++ 4 files changed, 322 insertions(+) create mode 100644 hw/virtio/vhost-user-i2c.c create mode 100644 include/hw/virtio/vhost-user-i2c.h diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig index 0eda25c4e1..35ab45e209 100644 --- a/hw/virtio/Kconfig +++ b/hw/virtio/Kconfig @@ -58,3 +58,8 @@ config VIRTIO_MEM depends on LINUX depends on VIRTIO_MEM_SUPPORTED select MEM_DEVICE + +config VHOST_USER_I2C + bool + default y + depends on VIRTIO && VHOST_USER diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build index fbff9bc9d4..1a0d736a0d 100644 --- a/hw/virtio/meson.build +++ b/hw/virtio/meson.build @@ -25,6 +25,7 @@ virtio_ss.add(when: 'CONFIG_VHOST_USER_VSOCK', if_true: files('vhost-user-vsock. virtio_ss.add(when: 'CONFIG_VIRTIO_RNG', if_true: files('virtio-rng.c')) virtio_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: files('virtio-iommu.c')) virtio_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: files('virtio-mem.c')) +virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: files('vhost-user-i2c.c')) virtio_pci_ss = ss.source_set() virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock-pci.c')) diff --git a/hw/virtio/vhost-user-i2c.c b/hw/virtio/vhost-user-i2c.c new file mode 100644 index 0000000000..d172632bb0 --- /dev/null +++ b/hw/virtio/vhost-user-i2c.c @@ -0,0 +1,288 @@ +/* + * Vhost-user i2c virtio device + * + * Copyright (c) 2021 Viresh Kumar + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-bus.h" +#include "hw/virtio/vhost-user-i2c.h" +#include "qemu/error-report.h" +#include "standard-headers/linux/virtio_ids.h" + +/* Remove this once the header is updated in Linux kernel */ +#ifndef VIRTIO_ID_I2C_ADAPTER +#define VIRTIO_ID_I2C_ADAPTER 34 +#endif + +static void vu_i2c_start(VirtIODevice *vdev) +{ + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + int ret, i; + + if (!k->set_guest_notifiers) { + error_report("binding does not support guest notifiers"); + return; + } + + ret = vhost_dev_enable_notifiers(&i2c->vhost_dev, vdev); + if (ret < 0) { + error_report("Error enabling host notifiers: %d", -ret); + return; + } + + ret = k->set_guest_notifiers(qbus->parent, i2c->vhost_dev.nvqs, true); + if (ret < 0) { + error_report("Error binding guest notifier: %d", -ret); + goto err_host_notifiers; + } + + i2c->vhost_dev.acked_features = vdev->guest_features; + + ret = vhost_dev_start(&i2c->vhost_dev, vdev); + if (ret < 0) { + error_report("Error starting vhost-user-i2c: %d", -ret); + goto err_guest_notifiers; + } + + /* + * guest_notifier_mask/pending not used yet, so just unmask + * everything here. virtio-pci will do the right thing by + * enabling/disabling irqfd. + */ + for (i = 0; i < i2c->vhost_dev.nvqs; i++) { + vhost_virtqueue_mask(&i2c->vhost_dev, vdev, i, false); + } + + return; + +err_guest_notifiers: + k->set_guest_notifiers(qbus->parent, i2c->vhost_dev.nvqs, false); +err_host_notifiers: + vhost_dev_disable_notifiers(&i2c->vhost_dev, vdev); +} + +static void vu_i2c_stop(VirtIODevice *vdev) +{ + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + int ret; + + if (!k->set_guest_notifiers) { + return; + } + + vhost_dev_stop(&i2c->vhost_dev, vdev); + + ret = k->set_guest_notifiers(qbus->parent, i2c->vhost_dev.nvqs, false); + if (ret < 0) { + error_report("vhost guest notifier cleanup failed: %d", ret); + return; + } + + vhost_dev_disable_notifiers(&i2c->vhost_dev, vdev); +} + +static void vu_i2c_set_status(VirtIODevice *vdev, uint8_t status) +{ + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + bool should_start = status & VIRTIO_CONFIG_S_DRIVER_OK; + + if (!vdev->vm_running) { + should_start = false; + } + + if (i2c->vhost_dev.started == should_start) { + return; + } + + if (should_start) { + vu_i2c_start(vdev); + } else { + vu_i2c_stop(vdev); + } +} + +static uint64_t vu_i2c_get_features(VirtIODevice *vdev, + uint64_t requested_features, Error **errp) +{ + /* No feature bits used yet */ + return requested_features; +} + +static void vu_i2c_handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ + /* + * Not normally called; it's the daemon that handles the queue; + * however virtio's cleanup path can call this. + */ +} + +static void vu_i2c_guest_notifier_mask(VirtIODevice *vdev, int idx, bool mask) +{ + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + + vhost_virtqueue_mask(&i2c->vhost_dev, vdev, idx, mask); +} + +static bool vu_i2c_guest_notifier_pending(VirtIODevice *vdev, int idx) +{ + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + + return vhost_virtqueue_pending(&i2c->vhost_dev, idx); +} + +static void do_vhost_user_cleanup(VirtIODevice *vdev, VHostUserI2C *i2c) +{ + vhost_user_cleanup(&i2c->vhost_user); + virtio_delete_queue(i2c->vq); + virtio_cleanup(vdev); + g_free(i2c->vhost_dev.vqs); + i2c->vhost_dev.vqs = NULL; +} + +static int vu_i2c_connect(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + + if (i2c->connected) { + return 0; + } + i2c->connected = true; + + /* restore vhost state */ + if (virtio_device_started(vdev, vdev->status)) { + vu_i2c_start(vdev); + } + + return 0; +} + +static void vu_i2c_disconnect(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + + if (!i2c->connected) { + return; + } + i2c->connected = false; + + if (i2c->vhost_dev.started) { + vu_i2c_stop(vdev); + } +} + +static void vu_i2c_event(void *opaque, QEMUChrEvent event) +{ + DeviceState *dev = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + + switch (event) { + case CHR_EVENT_OPENED: + if (vu_i2c_connect(dev) < 0) { + qemu_chr_fe_disconnect(&i2c->chardev); + return; + } + break; + case CHR_EVENT_CLOSED: + vu_i2c_disconnect(dev); + break; + case CHR_EVENT_BREAK: + case CHR_EVENT_MUX_IN: + case CHR_EVENT_MUX_OUT: + /* Ignore */ + break; + } +} + +static void vu_i2c_device_realize(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserI2C *i2c = VHOST_USER_I2C(dev); + int ret; + + if (!i2c->chardev.chr) { + error_setg(errp, "vhost-user-i2c: missing chardev"); + return; + } + + if (!vhost_user_init(&i2c->vhost_user, &i2c->chardev, errp)) { + return; + } + + virtio_init(vdev, "vhost-user-i2c", VIRTIO_ID_I2C_ADAPTER, 0); + + i2c->vhost_dev.nvqs = 1; + i2c->vq = virtio_add_queue(vdev, 4, vu_i2c_handle_output); + i2c->vhost_dev.vqs = g_new0(struct vhost_virtqueue, i2c->vhost_dev.nvqs); + + ret = vhost_dev_init(&i2c->vhost_dev, &i2c->vhost_user, + VHOST_BACKEND_TYPE_USER, 0, errp); + if (ret < 0) { + do_vhost_user_cleanup(vdev, i2c); + } + + qemu_chr_fe_set_handlers(&i2c->chardev, NULL, NULL, vu_i2c_event, NULL, + dev, NULL, true); +} + +static void vu_i2c_device_unrealize(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserI2C *i2c = VHOST_USER_I2C(dev); + + /* This will stop vhost backend if appropriate. */ + vu_i2c_set_status(vdev, 0); + vhost_dev_cleanup(&i2c->vhost_dev); + do_vhost_user_cleanup(vdev, i2c); +} + +static const VMStateDescription vu_i2c_vmstate = { + .name = "vhost-user-i2c", + .unmigratable = 1, +}; + +static Property vu_i2c_properties[] = { + DEFINE_PROP_CHR("chardev", VHostUserI2C, chardev), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vu_i2c_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, vu_i2c_properties); + dc->vmsd = &vu_i2c_vmstate; + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); + vdc->realize = vu_i2c_device_realize; + vdc->unrealize = vu_i2c_device_unrealize; + vdc->get_features = vu_i2c_get_features; + vdc->set_status = vu_i2c_set_status; + vdc->guest_notifier_mask = vu_i2c_guest_notifier_mask; + vdc->guest_notifier_pending = vu_i2c_guest_notifier_pending; +} + +static const TypeInfo vu_i2c_info = { + .name = TYPE_VHOST_USER_I2C, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VHostUserI2C), + .class_init = vu_i2c_class_init, +}; + +static void vu_i2c_register_types(void) +{ + type_register_static(&vu_i2c_info); +} + +type_init(vu_i2c_register_types) diff --git a/include/hw/virtio/vhost-user-i2c.h b/include/hw/virtio/vhost-user-i2c.h new file mode 100644 index 0000000000..deae47a76d --- /dev/null +++ b/include/hw/virtio/vhost-user-i2c.h @@ -0,0 +1,28 @@ +/* + * Vhost-user i2c virtio device + * + * Copyright (c) 2021 Viresh Kumar + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef _QEMU_VHOST_USER_I2C_H +#define _QEMU_VHOST_USER_I2C_H + +#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-user.h" + +#define TYPE_VHOST_USER_I2C "vhost-user-i2c-device" +OBJECT_DECLARE_SIMPLE_TYPE(VHostUserI2C, VHOST_USER_I2C) + +struct VHostUserI2C { + VirtIODevice parent; + CharBackend chardev; + struct vhost_virtqueue *vhost_vq; + struct vhost_dev vhost_dev; + VhostUserState vhost_user; + VirtQueue *vq; + bool connected; +}; + +#endif /* _QEMU_VHOST_USER_I2C_H */ From 538bb6f1219c1711538d73143f005e62c3d20f7b Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 9 Jul 2021 10:30:17 +0530 Subject: [PATCH 08/19] hw/virtio: add vhost-user-i2c-pci boilerplate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows is to instantiate a vhost-user-i2c device as part of a PCI bus. It is mostly boilerplate which looks pretty similar to the vhost-user-fs-pci device. Reviewed-by: Alex Bennée Signed-off-by: Viresh Kumar Message-Id: <8a083eaa57d93feaab12acd1f94b225879212f20.1625806763.git.viresh.kumar@linaro.org> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/meson.build | 1 + hw/virtio/vhost-user-i2c-pci.c | 69 ++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 hw/virtio/vhost-user-i2c-pci.c diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build index 1a0d736a0d..bc352a6009 100644 --- a/hw/virtio/meson.build +++ b/hw/virtio/meson.build @@ -26,6 +26,7 @@ virtio_ss.add(when: 'CONFIG_VIRTIO_RNG', if_true: files('virtio-rng.c')) virtio_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: files('virtio-iommu.c')) virtio_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: files('virtio-mem.c')) virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: files('vhost-user-i2c.c')) +virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_I2C'], if_true: files('vhost-user-i2c-pci.c')) virtio_pci_ss = ss.source_set() virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock-pci.c')) diff --git a/hw/virtio/vhost-user-i2c-pci.c b/hw/virtio/vhost-user-i2c-pci.c new file mode 100644 index 0000000000..70b7b65fd9 --- /dev/null +++ b/hw/virtio/vhost-user-i2c-pci.c @@ -0,0 +1,69 @@ +/* + * Vhost-user i2c virtio device PCI glue + * + * Copyright (c) 2021 Viresh Kumar + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/vhost-user-i2c.h" +#include "virtio-pci.h" + +struct VHostUserI2CPCI { + VirtIOPCIProxy parent_obj; + VHostUserI2C vdev; +}; + +typedef struct VHostUserI2CPCI VHostUserI2CPCI; + +#define TYPE_VHOST_USER_I2C_PCI "vhost-user-i2c-pci-base" + +DECLARE_INSTANCE_CHECKER(VHostUserI2CPCI, VHOST_USER_I2C_PCI, + TYPE_VHOST_USER_I2C_PCI) + +static void vhost_user_i2c_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VHostUserI2CPCI *dev = VHOST_USER_I2C_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + + vpci_dev->nvectors = 1; + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void vhost_user_i2c_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = vhost_user_i2c_pci_realize; + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = 0; /* Set by virtio-pci based on virtio id */ + pcidev_k->revision = 0x00; + pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER; +} + +static void vhost_user_i2c_pci_instance_init(Object *obj) +{ + VHostUserI2CPCI *dev = VHOST_USER_I2C_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VHOST_USER_I2C); +} + +static const VirtioPCIDeviceTypeInfo vhost_user_i2c_pci_info = { + .base_name = TYPE_VHOST_USER_I2C_PCI, + .non_transitional_name = "vhost-user-i2c-pci", + .instance_size = sizeof(VHostUserI2CPCI), + .instance_init = vhost_user_i2c_pci_instance_init, + .class_init = vhost_user_i2c_pci_class_init, +}; + +static void vhost_user_i2c_pci_register(void) +{ + virtio_pci_types_register(&vhost_user_i2c_pci_info); +} + +type_init(vhost_user_i2c_pci_register); From 7395b3e3e70031b1efff7941cbef6a1ceb6f2ffd Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 7 Jul 2021 11:41:14 -0400 Subject: [PATCH 09/19] docs: Add '-device intel-iommu' entry The parameters of intel-iommu device are non-trivial to understand. Add an entry for it so that people can reference to it when using. There're actually a few more options there, but I hide them explicitly because they shouldn't be used by normal QEMU users. Cc: Chao Yang Cc: Lei Yang Cc: Jing Zhao Cc: Jason Wang Cc: Michael S. Tsirkin Cc: Alex Williamson Reviewed-by: Jason Wang Reviewed-by: Yi Liu Signed-off-by: Peter Xu Message-Id: <20210707154114.197580-1-peterx@redhat.com> Reviewed-by: Maxim Levitsky Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- qemu-options.hx | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/qemu-options.hx b/qemu-options.hx index 214c477dcc..0c9ddc0274 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -939,6 +939,39 @@ SRST ``-device pci-ipmi-bt,bmc=id`` Like the KCS interface, but defines a BT interface on the PCI bus. + +``-device intel-iommu[,option=...]`` + This is only supported by ``-machine q35``, which will enable Intel VT-d + emulation within the guest. It supports below options: + + ``intremap=on|off`` (default: auto) + This enables interrupt remapping feature. It's required to enable + complete x2apic. Currently it only supports kvm kernel-irqchip modes + ``off`` or ``split``, while full kernel-irqchip is not yet supported. + The default value is "auto", which will be decided by the mode of + kernel-irqchip. + + ``caching-mode=on|off`` (default: off) + This enables caching mode for the VT-d emulated device. When + caching-mode is enabled, each guest DMA buffer mapping will generate an + IOTLB invalidation from the guest IOMMU driver to the vIOMMU device in + a synchronous way. It is required for ``-device vfio-pci`` to work + with the VT-d device, because host assigned devices requires to setup + the DMA mapping on the host before guest DMA starts. + + ``device-iotlb=on|off`` (default: off) + This enables device-iotlb capability for the emulated VT-d device. So + far virtio/vhost should be the only real user for this parameter, + paired with ats=on configured for the device. + + ``aw-bits=39|48`` (default: 39) + This decides the address width of IOVA address space. The address + space has 39 bits width for 3-level IOMMU page tables, and 48 bits for + 4-level IOMMU page tables. + + Please also refer to the wiki page for general scenarios of VT-d + emulation in QEMU: https://wiki.qemu.org/Features/VT-d. + ERST DEF("name", HAS_ARG, QEMU_OPTION_name, From 2d64b7bbb2a2e945635633486ef9a060cb2c89bc Mon Sep 17 00:00:00 2001 From: Xingang Wang Date: Thu, 8 Jul 2021 12:55:11 +0000 Subject: [PATCH 10/19] hw/pci/pci_host: Allow PCI host to bypass iommu Add a new bypass_iommu property for PCI host and use it to check whether devices attached to the PCI root bus will bypass iommu. In pci_device_iommu_address_space(), check the property and avoid getting iommu address space for devices bypass iommu. Signed-off-by: Xingang Wang Reviewed-by: Eric Auger Message-Id: <1625748919-52456-2-git-send-email-wangxingang5@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pci.c | 18 +++++++++++++++++- hw/pci/pci_host.c | 1 + include/hw/pci/pci.h | 1 + include/hw/pci/pci_host.h | 1 + 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 377084f1a8..27d588e268 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -416,6 +416,22 @@ const char *pci_root_bus_path(PCIDevice *dev) return rootbus->qbus.name; } +bool pci_bus_bypass_iommu(PCIBus *bus) +{ + PCIBus *rootbus = bus; + PCIHostState *host_bridge; + + if (!pci_bus_is_root(bus)) { + rootbus = pci_device_root_bus(bus->parent_dev); + } + + host_bridge = PCI_HOST_BRIDGE(rootbus->qbus.parent); + + assert(host_bridge->bus == rootbus); + + return host_bridge->bypass_iommu; +} + static void pci_root_bus_init(PCIBus *bus, DeviceState *parent, MemoryRegion *address_space_mem, MemoryRegion *address_space_io, @@ -2718,7 +2734,7 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) iommu_bus = parent_bus; } - if (iommu_bus && iommu_bus->iommu_fn) { + if (!pci_bus_bypass_iommu(bus) && iommu_bus && iommu_bus->iommu_fn) { return iommu_bus->iommu_fn(bus, iommu_bus->iommu_opaque, devfn); } return &address_space_memory; diff --git a/hw/pci/pci_host.c b/hw/pci/pci_host.c index 8ca5fadcbd..cf02f0d6a5 100644 --- a/hw/pci/pci_host.c +++ b/hw/pci/pci_host.c @@ -222,6 +222,7 @@ const VMStateDescription vmstate_pcihost = { static Property pci_host_properties_common[] = { DEFINE_PROP_BOOL("x-config-reg-migration-enabled", PCIHostState, mig_enabled, true), + DEFINE_PROP_BOOL("bypass-iommu", PCIHostState, bypass_iommu, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 6be4e0c460..f4d51b672b 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -480,6 +480,7 @@ void pci_for_each_bus(PCIBus *bus, PCIBus *pci_device_root_bus(const PCIDevice *d); const char *pci_root_bus_path(PCIDevice *dev); +bool pci_bus_bypass_iommu(PCIBus *bus); PCIDevice *pci_find_device(PCIBus *bus, int bus_num, uint8_t devfn); int pci_qdev_find_device(const char *id, PCIDevice **pdev); void pci_bus_get_w64_range(PCIBus *bus, Range *range); diff --git a/include/hw/pci/pci_host.h b/include/hw/pci/pci_host.h index 52e038c019..c6f4eb4585 100644 --- a/include/hw/pci/pci_host.h +++ b/include/hw/pci/pci_host.h @@ -43,6 +43,7 @@ struct PCIHostState { uint32_t config_reg; bool mig_enabled; PCIBus *bus; + bool bypass_iommu; QLIST_ENTRY(PCIHostState) next; }; From 91528f40ba3dda110927f11b47620bb24f8d601b Mon Sep 17 00:00:00 2001 From: Xingang Wang Date: Thu, 8 Jul 2021 12:55:12 +0000 Subject: [PATCH 11/19] hw/pxb: Add a bypass iommu property Add a bypass_iommu property for pci_expander_bridge, the property is used to indicate whether pxb root bus will bypass iommu. By default the bypass_iommu is disabled, and it can be enabled with: qemu -device pxb-pcie,bus_nr=0x10,addr=0x1,bypass_iommu=true Signed-off-by: Xingang Wang Reviewed-by: Eric Auger Message-Id: <1625748919-52456-3-git-send-email-wangxingang5@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci-bridge/pci_expander_bridge.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c index aedded1064..7112dc3062 100644 --- a/hw/pci-bridge/pci_expander_bridge.c +++ b/hw/pci-bridge/pci_expander_bridge.c @@ -57,6 +57,7 @@ struct PXBDev { uint8_t bus_nr; uint16_t numa_node; + bool bypass_iommu; }; static PXBDev *convert_to_pxb(PCIDevice *dev) @@ -255,6 +256,7 @@ static void pxb_dev_realize_common(PCIDevice *dev, bool pcie, Error **errp) bus->map_irq = pxb_map_irq_fn; PCI_HOST_BRIDGE(ds)->bus = bus; + PCI_HOST_BRIDGE(ds)->bypass_iommu = pxb->bypass_iommu; pxb_register_bus(dev, bus, &local_err); if (local_err) { @@ -301,6 +303,7 @@ static Property pxb_dev_properties[] = { /* Note: 0 is not a legal PXB bus number. */ DEFINE_PROP_UINT8("bus_nr", PXBDev, bus_nr, 0), DEFINE_PROP_UINT16("numa_node", PXBDev, numa_node, NUMA_NODE_UNASSIGNED), + DEFINE_PROP_BOOL("bypass_iommu", PXBDev, bypass_iommu, false), DEFINE_PROP_END_OF_LIST(), }; From 6d7a85483a069b99ad4b018509643437b8f9a4f7 Mon Sep 17 00:00:00 2001 From: Xingang Wang Date: Thu, 8 Jul 2021 12:55:13 +0000 Subject: [PATCH 12/19] hw/arm/virt: Add default_bus_bypass_iommu machine option Add a default_bus_bypass_iommu machine option to enable/disable bypass_iommu for default root bus. The option is disabled by default and can be enabled with: $QEMU -machine virt,iommu=smmuv3,default_bus_bypass_iommu=true Signed-off-by: Xingang Wang Message-Id: <1625748919-52456-4-git-send-email-wangxingang5@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/arm/virt.c | 26 ++++++++++++++++++++++++++ include/hw/arm/virt.h | 1 + 2 files changed, 27 insertions(+) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 93ab9d21ea..81eda46b0b 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -1367,6 +1367,7 @@ static void create_pcie(VirtMachineState *vms) } pci = PCI_HOST_BRIDGE(dev); + pci->bypass_iommu = vms->default_bus_bypass_iommu; vms->bus = pci->bus; if (vms->bus) { for (i = 0; i < nb_nics; i++) { @@ -2322,6 +2323,21 @@ static void virt_set_iommu(Object *obj, const char *value, Error **errp) } } +static bool virt_get_default_bus_bypass_iommu(Object *obj, Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(obj); + + return vms->default_bus_bypass_iommu; +} + +static void virt_set_default_bus_bypass_iommu(Object *obj, bool value, + Error **errp) +{ + VirtMachineState *vms = VIRT_MACHINE(obj); + + vms->default_bus_bypass_iommu = value; +} + static CpuInstanceProperties virt_cpu_index_to_props(MachineState *ms, unsigned cpu_index) { @@ -2661,6 +2677,13 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) "Set the IOMMU type. " "Valid values are none and smmuv3"); + object_class_property_add_bool(oc, "default_bus_bypass_iommu", + virt_get_default_bus_bypass_iommu, + virt_set_default_bus_bypass_iommu); + object_class_property_set_description(oc, "default_bus_bypass_iommu", + "Set on/off to enable/disable " + "bypass_iommu for default root bus"); + object_class_property_add_bool(oc, "ras", virt_get_ras, virt_set_ras); object_class_property_set_description(oc, "ras", @@ -2728,6 +2751,9 @@ static void virt_instance_init(Object *obj) /* Default disallows iommu instantiation */ vms->iommu = VIRT_IOMMU_NONE; + /* The default root bus is attached to iommu by default */ + vms->default_bus_bypass_iommu = false; + /* Default disallows RAS instantiation */ vms->ras = false; diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 921416f918..9661c46699 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -147,6 +147,7 @@ struct VirtMachineState { OnOffAuto acpi; VirtGICType gic_version; VirtIOMMUType iommu; + bool default_bus_bypass_iommu; VirtMSIControllerType msi_controller; uint16_t virtio_iommu_bdf; struct arm_boot_info bootinfo; From c9e96b04fc192fb38622694947dae091bbbdf28f Mon Sep 17 00:00:00 2001 From: Xingang Wang Date: Thu, 8 Jul 2021 12:55:14 +0000 Subject: [PATCH 13/19] hw/i386: Add a default_bus_bypass_iommu pc machine option Add a default_bus_bypass_iommu pc machine option to enable/disable bypass_iommu for default root bus. The option is disabled by default and can be enabled with: $QEMU -machine q35,default_bus_bypass_iommu=true Signed-off-by: Xingang Wang Message-Id: <1625748919-52456-5-git-send-email-wangxingang5@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 20 ++++++++++++++++++++ hw/pci-host/q35.c | 2 ++ include/hw/i386/pc.h | 1 + 3 files changed, 23 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index f4c7a78362..c2b9d62a35 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1524,6 +1524,21 @@ static void pc_machine_set_hpet(Object *obj, bool value, Error **errp) pcms->hpet_enabled = value; } +static bool pc_machine_get_default_bus_bypass_iommu(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->default_bus_bypass_iommu; +} + +static void pc_machine_set_default_bus_bypass_iommu(Object *obj, bool value, + Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->default_bus_bypass_iommu = value; +} + static void pc_machine_get_max_ram_below_4g(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) @@ -1623,6 +1638,7 @@ static void pc_machine_initfn(Object *obj) #ifdef CONFIG_HPET pcms->hpet_enabled = true; #endif + pcms->default_bus_bypass_iommu = false; pc_system_flash_create(pcms); pcms->pcspk = isa_new(TYPE_PC_SPEAKER); @@ -1747,6 +1763,10 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) object_class_property_add_bool(oc, "hpet", pc_machine_get_hpet, pc_machine_set_hpet); + object_class_property_add_bool(oc, "default_bus_bypass_iommu", + pc_machine_get_default_bus_bypass_iommu, + pc_machine_set_default_bus_bypass_iommu); + object_class_property_add(oc, PC_MACHINE_MAX_FW_SIZE, "size", pc_machine_get_max_fw_size, pc_machine_set_max_fw_size, NULL, NULL); diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c index 0f37cf056a..ab5a47aff5 100644 --- a/hw/pci-host/q35.c +++ b/hw/pci-host/q35.c @@ -65,6 +65,8 @@ static void q35_host_realize(DeviceState *dev, Error **errp) s->mch.address_space_io, 0, TYPE_PCIE_BUS); PC_MACHINE(qdev_get_machine())->bus = pci->bus; + pci->bypass_iommu = + PC_MACHINE(qdev_get_machine())->default_bus_bypass_iommu; qdev_realize(DEVICE(&s->mch), BUS(pci->bus), &error_fatal); } diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 0775f945d7..88dffe7517 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -44,6 +44,7 @@ typedef struct PCMachineState { bool sata_enabled; bool pit_enabled; bool hpet_enabled; + bool default_bus_bypass_iommu; uint64_t max_fw_size; /* NUMA information: */ From 500db1daf3079c3d0222bd3963cf4e5bf6175175 Mon Sep 17 00:00:00 2001 From: Xingang Wang Date: Thu, 8 Jul 2021 12:55:15 +0000 Subject: [PATCH 14/19] hw/pci: Add pci_bus_range() to get PCI bus number range This helps to get the min and max bus number of a PCI bus hierarchy. Signed-off-by: Xingang Wang Reviewed-by: Eric Auger Message-Id: <1625748919-52456-6-git-send-email-wangxingang5@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pci.c | 16 ++++++++++++++++ include/hw/pci/pci.h | 1 + 2 files changed, 17 insertions(+) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 27d588e268..23d2ae2ab2 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -537,6 +537,22 @@ int pci_bus_num(PCIBus *s) return PCI_BUS_GET_CLASS(s)->bus_num(s); } +/* Returns the min and max bus numbers of a PCI bus hierarchy */ +void pci_bus_range(PCIBus *bus, int *min_bus, int *max_bus) +{ + int i; + *min_bus = *max_bus = pci_bus_num(bus); + + for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) { + PCIDevice *dev = bus->devices[i]; + + if (dev && PCI_DEVICE_GET_CLASS(dev)->is_bridge) { + *min_bus = MIN(*min_bus, dev->config[PCI_SECONDARY_BUS]); + *max_bus = MAX(*max_bus, dev->config[PCI_SUBORDINATE_BUS]); + } + } +} + int pci_bus_numa_node(PCIBus *bus) { return PCI_BUS_GET_CLASS(bus)->numa_node(bus); diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index f4d51b672b..d0f4266e37 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -450,6 +450,7 @@ static inline PCIBus *pci_get_bus(const PCIDevice *dev) return PCI_BUS(qdev_get_parent_bus(DEVICE(dev))); } int pci_bus_num(PCIBus *s); +void pci_bus_range(PCIBus *bus, int *min_bus, int *max_bus); static inline int pci_dev_bus_num(const PCIDevice *dev) { return pci_bus_num(pci_get_bus(dev)); From 42e0f050e3a513f070b1c2e885890ac2095337e8 Mon Sep 17 00:00:00 2001 From: Xingang Wang Date: Thu, 8 Jul 2021 12:55:16 +0000 Subject: [PATCH 15/19] hw/arm/virt-acpi-build: Add IORT support to bypass SMMUv3 When we build IORT table with SMMUv3 and bypass iommu feature enabled, we can no longer setup one map from RC to SMMUv3 covering the whole RIDs. We need to walk the PCI bus and check whether the root bus will bypass iommu, setup RC -> SMMUv3 -> ITS map for RC which will not bypass iommu. When a SMMUv3 node exist, we setup the idmap from SMMUv3 to ITS covering the whole RIDs, and only modify the map from RC to SMMUv3. We build RC -> SMMUv3 -> ITS map for root bus with bypass_iommu disabled, and build idmap from RC to ITS directly for the rest of the whole RID space. For example we run qemu with command line: qemu/build/aarch64-softmmu/qemu-system-aarch64 \ -kernel arch/arm64/boot/Image \ -enable-kvm \ -cpu host \ -m 8G \ -smp 8,sockets=2,cores=4,threads=1 \ -machine virt,kernel_irqchip=on,gic-version=3,iommu=smmuv3,default_bus_bypass_iommu=true \ -drive file=./QEMU_EFI-pflash.raw,if=pflash,format=raw,unit=0,readonly=on \ -device pxb-pcie,bus_nr=0x10,id=pci.10,bus=pcie.0,addr=0x3.0x1 \ -device pxb-pcie,bus_nr=0x20,id=pci.20,bus=pcie.0,addr=0x3.0x2,bypass_iommu=true \ -device pcie-root-port,port=0x20,chassis=1,id=pci.1,bus=pcie.0,addr=0x2 \ -device pcie-root-port,port=0x20,chassis=11,id=pci.11,bus=pci.10,addr=0x1 \ -device pcie-root-port,port=0x20,chassis=21,id=pci.21,bus=pci.20,addr=0x1 \ -device virtio-scsi-pci,id=scsi0,bus=pci.1,addr=0x1 \ -device virtio-scsi-pci,id=scsi1,bus=pci.11,addr=0x1 \ -device virtio-scsi-pci,id=scsi2,bus=pci.21,addr=0x1 \ -initrd /mnt/davinci/wxg/kill-linux/rootfs/mfs.cpio.gz \ -nographic \ -append "rdinit=init console=ttyAMA0 earlycon=pl011,0x9000000 nokaslr" \ And we get guest configuration: -+-[0000:20]---01.0-[21]-- +-[0000:10]---01.0-[11]-- \-[0000:00]-+-00.0 Device 1b36:0008 +-01.0 Device 1af4:1000 \-02.0-[01]-- With bypass_iommu enabled, the attached devices will bypass iommu. /sys/class/iommu/smmu3.0x0000000009050000/ |-- device -> ../../../arm-smmu-v3.0.auto |-- devices | `-- 0000:10:01.0 -> ../../../../../pci0000:10/0000:10:01.0 Signed-off-by: Xingang Wang Message-Id: <1625748919-52456-7-git-send-email-wangxingang5@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/arm/virt-acpi-build.c | 114 +++++++++++++++++++++++++++++++++++---- 1 file changed, 103 insertions(+), 11 deletions(-) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index f1024843dd..037cc1fd82 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -44,6 +44,7 @@ #include "hw/acpi/tpm.h" #include "hw/pci/pcie_host.h" #include "hw/pci/pci.h" +#include "hw/pci/pci_bus.h" #include "hw/pci-host/gpex.h" #include "hw/arm/virt.h" #include "hw/mem/nvdimm.h" @@ -239,23 +240,89 @@ static void acpi_dsdt_add_tpm(Aml *scope, VirtMachineState *vms) } #endif +/* Build the iort ID mapping to SMMUv3 for a given PCI host bridge */ +static int +iort_host_bridges(Object *obj, void *opaque) +{ + GArray *idmap_blob = opaque; + + if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { + PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; + + if (bus && !pci_bus_bypass_iommu(bus)) { + int min_bus, max_bus; + + pci_bus_range(bus, &min_bus, &max_bus); + + AcpiIortIdMapping idmap = { + .input_base = min_bus << 8, + .id_count = (max_bus - min_bus + 1) << 8, + }; + g_array_append_val(idmap_blob, idmap); + } + } + + return 0; +} + +static int iort_idmap_compare(gconstpointer a, gconstpointer b) +{ + AcpiIortIdMapping *idmap_a = (AcpiIortIdMapping *)a; + AcpiIortIdMapping *idmap_b = (AcpiIortIdMapping *)b; + + return idmap_a->input_base - idmap_b->input_base; +} + static void build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { - int nb_nodes, iort_start = table_data->len; + int i, nb_nodes, rc_mapping_count, iort_start = table_data->len; AcpiIortIdMapping *idmap; AcpiIortItsGroup *its; AcpiIortTable *iort; AcpiIortSmmu3 *smmu; size_t node_size, iort_node_offset, iort_length, smmu_offset = 0; AcpiIortRC *rc; + GArray *smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); + GArray *its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping)); iort = acpi_data_push(table_data, sizeof(*iort)); if (vms->iommu == VIRT_IOMMU_SMMUV3) { + AcpiIortIdMapping next_range = {0}; + + object_child_foreach_recursive(object_get_root(), + iort_host_bridges, smmu_idmaps); + + /* Sort the smmu idmap by input_base */ + g_array_sort(smmu_idmaps, iort_idmap_compare); + + /* + * Split the whole RIDs by mapping from RC to SMMU, + * build the ID mapping from RC to ITS directly. + */ + for (i = 0; i < smmu_idmaps->len; i++) { + idmap = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); + + if (next_range.input_base < idmap->input_base) { + next_range.id_count = idmap->input_base - next_range.input_base; + g_array_append_val(its_idmaps, next_range); + } + + next_range.input_base = idmap->input_base + idmap->id_count; + } + + /* Append the last RC -> ITS ID mapping */ + if (next_range.input_base < 0xFFFF) { + next_range.id_count = 0xFFFF - next_range.input_base; + g_array_append_val(its_idmaps, next_range); + } + nb_nodes = 3; /* RC, ITS, SMMUv3 */ + rc_mapping_count = smmu_idmaps->len + its_idmaps->len; } else { nb_nodes = 2; /* RC, ITS */ + rc_mapping_count = 1; } iort_length = sizeof(*iort); @@ -307,13 +374,13 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } /* Root Complex Node */ - node_size = sizeof(*rc) + sizeof(*idmap); + node_size = sizeof(*rc) + sizeof(*idmap) * rc_mapping_count; iort_length += node_size; rc = acpi_data_push(table_data, node_size); rc->type = ACPI_IORT_NODE_PCI_ROOT_COMPLEX; rc->length = cpu_to_le16(node_size); - rc->mapping_count = cpu_to_le32(1); + rc->mapping_count = cpu_to_le32(rc_mapping_count); rc->mapping_offset = cpu_to_le32(sizeof(*rc)); /* fully coherent device */ @@ -321,20 +388,45 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) rc->memory_properties.memory_flags = 0x3; /* CCA = CPM = DCAS = 1 */ rc->pci_segment_number = 0; /* MCFG pci_segment */ - /* Identity RID mapping covering the whole input RID range */ - idmap = &rc->id_mapping_array[0]; - idmap->input_base = 0; - idmap->id_count = cpu_to_le32(0xFFFF); - idmap->output_base = 0; - if (vms->iommu == VIRT_IOMMU_SMMUV3) { - /* output IORT node is the smmuv3 node */ - idmap->output_reference = cpu_to_le32(smmu_offset); + AcpiIortIdMapping *range; + + /* translated RIDs connect to SMMUv3 node: RC -> SMMUv3 -> ITS */ + for (i = 0; i < smmu_idmaps->len; i++) { + idmap = &rc->id_mapping_array[i]; + range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i); + + idmap->input_base = cpu_to_le32(range->input_base); + idmap->id_count = cpu_to_le32(range->id_count); + idmap->output_base = cpu_to_le32(range->input_base); + /* output IORT node is the smmuv3 node */ + idmap->output_reference = cpu_to_le32(smmu_offset); + } + + /* bypassed RIDs connect to ITS group node directly: RC -> ITS */ + for (i = 0; i < its_idmaps->len; i++) { + idmap = &rc->id_mapping_array[smmu_idmaps->len + i]; + range = &g_array_index(its_idmaps, AcpiIortIdMapping, i); + + idmap->input_base = cpu_to_le32(range->input_base); + idmap->id_count = cpu_to_le32(range->id_count); + idmap->output_base = cpu_to_le32(range->input_base); + /* output IORT node is the ITS group node (the first node) */ + idmap->output_reference = cpu_to_le32(iort_node_offset); + } } else { + /* Identity RID mapping covering the whole input RID range */ + idmap = &rc->id_mapping_array[0]; + idmap->input_base = cpu_to_le32(0); + idmap->id_count = cpu_to_le32(0xFFFF); + idmap->output_base = cpu_to_le32(0); /* output IORT node is the ITS group node (the first node) */ idmap->output_reference = cpu_to_le32(iort_node_offset); } + g_array_free(smmu_idmaps, true); + g_array_free(its_idmaps, true); + /* * Update the pointer address in case table_data->data moves during above * acpi_data_push operations. From 26863366b29ab638711bbe0fa741dcf355eddb54 Mon Sep 17 00:00:00 2001 From: Xingang Wang Date: Thu, 8 Jul 2021 12:55:17 +0000 Subject: [PATCH 16/19] hw/i386/acpi-build: Add DMAR support to bypass iommu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In DMAR table, the drhd is set to cover all PCI devices when intel_iommu is on. To support bypass iommu feature, we need to walk the PCI bus with bypass_iommu disabled and add explicit scope data in DMAR drhd structure. /mnt/sdb/wxg/qemu-next/qemu/build/x86_64-softmmu/qemu-system-x86_64 \ -machine q35,accel=kvm,default_bus_bypass_iommu=true \ -cpu host \ -m 16G \ -smp 36,sockets=2,cores=18,threads=1 \ -device pxb-pcie,bus_nr=0x10,id=pci.10,bus=pcie.0,addr=0x3 \ -device pxb-pcie,bus_nr=0x20,id=pci.20,bus=pcie.0,addr=0x4,bypass_iommu=true \ -device pcie-root-port,port=0x1,chassis=1,id=pci.11,bus=pci.10,addr=0x0 \ -device pcie-root-port,port=0x2,chassis=2,id=pci.21,bus=pci.20,addr=0x0 \ -device virtio-scsi-pci,id=scsi0,bus=pci.11,addr=0x0 \ -device virtio-scsi-pci,id=scsi1,bus=pci.21,addr=0x0 \ -drive file=/mnt/sdb/wxg/fedora-48g.qcow2,format=qcow2,if=none,id=drive-scsi0-0-0-0,cache=none,aio=native \ -device scsi-hd,bus=scsi1.0,channel=0,scsi-id=0,lun=0,drive=drive-scsi0-0-0-0,id=scsi0-0-0-0,bootindex=1 \ -device intel-iommu \ -nographic \ And we get the guest configuration: ~ lspci -vt -+-[0000:20]---00.0-[21]----00.0 Red Hat, Inc. Virtio SCSI +-[0000:10]---00.0-[11]----00.0 Red Hat, Inc. Virtio SCSI \-[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller +-01.0 Device 1234:1111 +-02.0 Intel Corporation 82574L Gigabit Network Connection +-03.0 Red Hat, Inc. QEMU PCIe Expander bridge +-04.0 Red Hat, Inc. QEMU PCIe Expander bridge +-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller +-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller [AHCI mode] \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller With bypass_iommu enabled on root bus, the attached devices will bypass iommu: /sys/class/iommu/dmar0 ├── devices │   ├── 0000:10:00.0 -> ../../../../pci0000:10/0000:10:00.0 │   └── 0000:11:00.0 -> ../../../../pci0000:10/0000:10:00.0/0000:11:00.0 Signed-off-by: Xingang Wang Message-Id: <1625748919-52456-8-git-send-email-wangxingang5@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/acpi-build.c | 68 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index bc966a4110..7efc6285ac 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2022,6 +2022,56 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) x86ms->oem_table_id); } +/* + * Insert DMAR scope for PCI bridges and endpoint devcie + */ +static void +insert_scope(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + GArray *scope_blob = opaque; + AcpiDmarDeviceScope *scope = NULL; + + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) { + /* Dmar Scope Type: 0x02 for PCI Bridge */ + build_append_int_noprefix(scope_blob, 0x02, 1); + } else { + /* Dmar Scope Type: 0x01 for PCI Endpoint Device */ + build_append_int_noprefix(scope_blob, 0x01, 1); + } + + /* length */ + build_append_int_noprefix(scope_blob, + sizeof(*scope) + sizeof(scope->path[0]), 1); + /* reserved */ + build_append_int_noprefix(scope_blob, 0, 2); + /* enumeration_id */ + build_append_int_noprefix(scope_blob, 0, 1); + /* bus */ + build_append_int_noprefix(scope_blob, pci_bus_num(bus), 1); + /* device */ + build_append_int_noprefix(scope_blob, PCI_SLOT(dev->devfn), 1); + /* function */ + build_append_int_noprefix(scope_blob, PCI_FUNC(dev->devfn), 1); +} + +/* For a given PCI host bridge, walk and insert DMAR scope */ +static int +dmar_host_bridges(Object *obj, void *opaque) +{ + GArray *scope_blob = opaque; + + if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { + PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; + + if (bus && !pci_bus_bypass_iommu(bus)) { + pci_for_each_device(bus, pci_bus_num(bus), insert_scope, + scope_blob); + } + } + + return 0; +} + /* * VT-d spec 8.1 DMA Remapping Reporting Structure * (version Oct. 2014 or later) @@ -2041,6 +2091,15 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker, const char *oem_id, /* Root complex IOAPIC use one path[0] only */ size_t ioapic_scope_size = sizeof(*scope) + sizeof(scope->path[0]); IntelIOMMUState *intel_iommu = INTEL_IOMMU_DEVICE(iommu); + GArray *scope_blob = g_array_new(false, true, 1); + + /* + * A PCI bus walk, for each PCI host bridge. + * Insert scope for each PCI bridge and endpoint device which + * is attached to a bus with iommu enabled. + */ + object_child_foreach_recursive(object_get_root(), + dmar_host_bridges, scope_blob); assert(iommu); if (x86_iommu_ir_supported(iommu)) { @@ -2054,8 +2113,9 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker, const char *oem_id, /* DMAR Remapping Hardware Unit Definition structure */ drhd = acpi_data_push(table_data, sizeof(*drhd) + ioapic_scope_size); drhd->type = cpu_to_le16(ACPI_DMAR_TYPE_HARDWARE_UNIT); - drhd->length = cpu_to_le16(sizeof(*drhd) + ioapic_scope_size); - drhd->flags = ACPI_DMAR_INCLUDE_PCI_ALL; + drhd->length = + cpu_to_le16(sizeof(*drhd) + ioapic_scope_size + scope_blob->len); + drhd->flags = 0; /* Don't include all pci device */ drhd->pci_segment = cpu_to_le16(0); drhd->address = cpu_to_le64(Q35_HOST_BRIDGE_IOMMU_ADDR); @@ -2069,6 +2129,10 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker, const char *oem_id, scope->path[0].device = PCI_SLOT(Q35_PSEUDO_DEVFN_IOAPIC); scope->path[0].function = PCI_FUNC(Q35_PSEUDO_DEVFN_IOAPIC); + /* Add scope found above */ + g_array_append_vals(table_data, scope_blob->data, scope_blob->len); + g_array_free(scope_blob, true); + if (iommu->dt_supported) { atsr = acpi_data_push(table_data, sizeof(*atsr)); atsr->type = cpu_to_le16(ACPI_DMAR_TYPE_ATSR); From dec2f5636e9b590a5d2d2e20e821e9899f7c63bf Mon Sep 17 00:00:00 2001 From: Xingang Wang Date: Thu, 8 Jul 2021 12:55:18 +0000 Subject: [PATCH 17/19] hw/i386/acpi-build: Add IVRS support to bypass iommu Check bypass_iommu to exclude the devices which will bypass iommu. Signed-off-by: Xingang Wang Message-Id: <1625748919-52456-9-git-send-email-wangxingang5@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/acpi-build.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 7efc6285ac..17836149fe 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2263,7 +2263,7 @@ ivrs_host_bridges(Object *obj, void *opaque) if (object_dynamic_cast(obj, TYPE_PCI_HOST_BRIDGE)) { PCIBus *bus = PCI_HOST_BRIDGE(obj)->bus; - if (bus) { + if (bus && !pci_bus_bypass_iommu(bus)) { pci_for_each_device(bus, pci_bus_num(bus), insert_ivhd, ivhd_blob); } } From 732de3e72d8872b34b82afb3a5b063b2e0c1c85b Mon Sep 17 00:00:00 2001 From: Xingang Wang Date: Thu, 8 Jul 2021 12:55:19 +0000 Subject: [PATCH 18/19] docs: Add documentation for iommu bypass Signed-off-by: Xingang Wang Message-Id: <1625748919-52456-10-git-send-email-wangxingang5@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- docs/bypass-iommu.txt | 89 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 docs/bypass-iommu.txt diff --git a/docs/bypass-iommu.txt b/docs/bypass-iommu.txt new file mode 100644 index 0000000000..e6677bddd3 --- /dev/null +++ b/docs/bypass-iommu.txt @@ -0,0 +1,89 @@ +BYPASS IOMMU PROPERTY +===================== + +Description +=========== +Traditionally, there is a global switch to enable/disable vIOMMU. All +devices in the system can only support go through vIOMMU or not, which +is not flexible. We introduce this bypass iommu property to support +coexist of devices go through vIOMMU and devices not. This is useful to +passthrough devices with no-iommu mode and devices go through vIOMMU in +the same virtual machine. + +PCI host bridges have a bypass_iommu property. This property is used to +determine whether the devices attached on the PCI host bridge will bypass +virtual iommu. The bypass_iommu property is valid only when there is a +virtual iommu in the system, it is implemented to allow some devices to +bypass vIOMMU. When bypass_iommu property is not set for a host bridge, +the attached devices will go through vIOMMU by default. + +Usage +===== +The bypass iommu feature support PXB host bridge and default main host +bridge, we add a bypass_iommu property for PXB and default_bus_bypass_iommu +for machine. Note that default_bus_bypass_iommu is available only when +the 'q35' machine type on x86 architecture and the 'virt' machine type +on AArch64. Other machine types do not support bypass iommu for default +root bus. + +1. The following is the bypass iommu options: + (1) PCI expander bridge + qemu -device pxb-pcie,bus_nr=0x10,addr=0x1,bypass_iommu=true + (2) Arm default host bridge + qemu -machine virt,iommu=smmuv3,default_bus_bypass_iommu=true + (3) X86 default root bus bypass iommu: + qemu -machine q35,default_bus_bypass_iommu=true + +2. Here is the detailed qemu command line for 'virt' machine with PXB on +AArch64: + +qemu-system-aarch64 \ + -machine virt,kernel_irqchip=on,iommu=smmuv3,default_bus_bypass_iommu=true \ + -device pxb-pcie,bus_nr=0x10,id=pci.10,bus=pcie.0,addr=0x3.0x1 \ + -device pxb-pcie,bus_nr=0x20,id=pci.20,bus=pcie.0,addr=0x3.0x2,bypass_iommu=true \ + +And we got: + - a default host bridge which bypass SMMUv3 + - a pxb host bridge which go through SMMUv3 + - a pxb host bridge which bypass SMMUv3 + +3. Here is the detailed qemu command line for 'q35' machine with PXB on +x86 architecture: + +qemu-system-x86_64 \ + -machine q35,accel=kvm,default_bus_bypass_iommu=true \ + -device pxb-pcie,bus_nr=0x10,id=pci.10,bus=pcie.0,addr=0x3 \ + -device pxb-pcie,bus_nr=0x20,id=pci.20,bus=pcie.0,addr=0x4,bypass_iommu=true \ + -device intel-iommu \ + +And we got: + - a default host bridge which bypass iommu + - a pxb host bridge which go through iommu + - a pxb host bridge which bypass iommu + +Limitations +=========== +There might be potential security risk when devices bypass iommu, because +devices might send malicious dma request to virtual machine if there is no +iommu isolation. So it would be necessary to only bypass iommu for trusted +device. + +Implementation +============== +The bypass iommu feature includes: + - Address space + Add bypass iommu property check of PCI Host and do not get iommu address + space for devices bypass iommu. + - Arm SMMUv3 support + We traverse all PCI root bus and get bus number ranges, then build explicit + RID mapping for devices which do not bypass iommu. + - X86 IOMMU support + To support Intel iommu, we traverse all PCI host bridge and get information + of devices which do not bypass iommu, then fill the DMAR drhd struct with + explicit device scope info. To support AMD iommu, add check of bypass iommu + when traverse the PCI hsot bridge. + - Machine and PXB options + We add bypass iommu options in machine option for default root bus, and add + option for PXB also. Note that the default value of bypass iommu is false, + so that the devices will by default go through iommu if there exist one. + From 1e08fd0a465d70ad30d2928c66537c816f0af7f8 Mon Sep 17 00:00:00 2001 From: Arseny Krasnov Date: Tue, 22 Jun 2021 17:47:47 +0300 Subject: [PATCH 19/19] vhost-vsock: SOCK_SEQPACKET feature bit support This adds processing of VIRTIO_VSOCK_F_SEQPACKET features bit. Guest negotiates it with vhost, thus both will know that SOCK_SEQPACKET supported by peer. Signed-off-by: Arseny Krasnov Message-Id: <20210622144747.2949134-1-arseny.krasnov@kaspersky.com> Reviewed-by: Stefano Garzarella Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-vsock.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/hw/virtio/vhost-vsock.c b/hw/virtio/vhost-vsock.c index 777cafe70d..1b1a5c70ed 100644 --- a/hw/virtio/vhost-vsock.c +++ b/hw/virtio/vhost-vsock.c @@ -21,6 +21,11 @@ #include "hw/virtio/vhost-vsock.h" #include "monitor/monitor.h" +const int feature_bits[] = { + VIRTIO_VSOCK_F_SEQPACKET, + VHOST_INVALID_FEATURE_BIT +}; + static void vhost_vsock_get_config(VirtIODevice *vdev, uint8_t *config) { VHostVSock *vsock = VHOST_VSOCK(vdev); @@ -108,8 +113,11 @@ static uint64_t vhost_vsock_get_features(VirtIODevice *vdev, uint64_t requested_features, Error **errp) { - /* No feature bits used yet */ - return requested_features; + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + + virtio_add_feature(&requested_features, VIRTIO_VSOCK_F_SEQPACKET); + return vhost_get_features(&vvc->vhost_dev, feature_bits, + requested_features); } static const VMStateDescription vmstate_virtio_vhost_vsock = {