vfio: vfio-pci device assignment driver
This adds the core of the QEMU VFIO-based PCI device assignment driver.
To make use of this driver, enable CONFIG_VFIO, CONFIG_VFIO_IOMMU_TYPE1,
and CONFIG_VFIO_PCI in your host Linux kernel config. Load the vfio-pci
module. To assign device 0000:05:00.0 to a guest, do the following:
for dev in $(ls /sys/bus/pci/devices/0000:05:00.0/iommu_group/devices); do
vendor=$(cat /sys/bus/pci/devices/$dev/vendor)
device=$(cat /sys/bus/pci/devices/$dev/device)
if [ -e /sys/bus/pci/devices/$dev/driver ]; then
echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
fi
echo $vendor $device > /sys/bus/pci/drivers/vfio-pci/new_id
done
See Documentation/vfio.txt in the Linux kernel tree for further
description of IOMMU groups and VFIO.
Then launch qemu including the option:
-device vfio-pci,host=0000:05:00.0
Legacy PCI interrupts (INTx) currently make use of a kludge where we
trap BAR accesses and assume the access is in response to an interrupt,
therefore de-asserting and unmasking the interrupt. It's not quite as
targeted as using the EOI for this, but it's self-contained and seems
to work across all architectures. The side effect is a significant
performance slow-down for devices in INTx mode. Some devices, like
graphics cards, don't really use their interrupt, so this can be turned
off with the x-intx=off option, which disables INTx altogether. This
should be considered an experimental option until we refine this code.
Both MSI and MSI-X are supported and avoid these issues.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
2012-09-27 01:19:32 +08:00
|
|
|
/*
|
|
|
|
* vfio based device assignment support
|
|
|
|
*
|
|
|
|
* Copyright Red Hat, Inc. 2012
|
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
* Alex Williamson <alex.williamson@redhat.com>
|
|
|
|
*
|
|
|
|
* This work is licensed under the terms of the GNU GPL, version 2. See
|
|
|
|
* the COPYING file in the top-level directory.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef HW_VFIO_PCI_INT_H
|
|
|
|
#define HW_VFIO_PCI_INT_H
|
|
|
|
|
|
|
|
#include "qemu-common.h"
|
|
|
|
#include "qemu-queue.h"
|
|
|
|
#include "pci.h"
|
|
|
|
#include "event_notifier.h"
|
|
|
|
|
|
|
|
typedef struct VFIOBAR {
|
|
|
|
off_t fd_offset; /* offset of BAR within device fd */
|
|
|
|
int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
|
|
|
|
MemoryRegion mem; /* slow, read/write access */
|
|
|
|
MemoryRegion mmap_mem; /* direct mapped access */
|
|
|
|
void *mmap;
|
|
|
|
size_t size;
|
|
|
|
uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
|
|
|
|
uint8_t nr; /* cache the BAR number for debug */
|
|
|
|
} VFIOBAR;
|
|
|
|
|
|
|
|
typedef struct VFIOINTx {
|
|
|
|
bool pending; /* interrupt pending */
|
|
|
|
bool kvm_accel; /* set when QEMU bypass through KVM enabled */
|
|
|
|
uint8_t pin; /* which pin to pull for qemu_set_irq */
|
|
|
|
EventNotifier interrupt; /* eventfd triggered on interrupt */
|
|
|
|
EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
|
|
|
|
PCIINTxRoute route; /* routing info for QEMU bypass */
|
vfio-pci: Update slow path INTx algorithm
We can't afford the overhead of switching out and back into mmap mode
around each interrupt, but we can do it lazily via a timer. On INTx
interrupt, disable the mmap'd memory regions and set a timer. On
every interrupt, push the timer out. If the timer expires and the
interrupt is no longer pending, switch back to mmap mode.
This has the benefit that things like graphics cards, which rarely or
never, fire an interrupt don't need manual user intervention to add
the x-intx=off parameter. They'll just remain in mmap mode until they
trigger an interrupt, and if they don't continue to regularly fire
interrupts, they'll switch back.
The default timeout is tuned for network cards so that a ping is just
enough to keep them in non-mmap mode, where they have much better
latency. It is tunable with an experimental option,
x-intx-mmap-timeout-ms. A value of 0 keeps the device in non-mmap
mode after the first interrupt.
It's possible we could look at the class code of devices and come up
with reasonable per-class defaults based on expected interrupt
frequency and latency. None of this is used for MSI interrupts and
also won't be used if we can bypass through KVM.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2012-10-08 22:45:29 +08:00
|
|
|
uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */
|
|
|
|
QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
|
vfio: vfio-pci device assignment driver
This adds the core of the QEMU VFIO-based PCI device assignment driver.
To make use of this driver, enable CONFIG_VFIO, CONFIG_VFIO_IOMMU_TYPE1,
and CONFIG_VFIO_PCI in your host Linux kernel config. Load the vfio-pci
module. To assign device 0000:05:00.0 to a guest, do the following:
for dev in $(ls /sys/bus/pci/devices/0000:05:00.0/iommu_group/devices); do
vendor=$(cat /sys/bus/pci/devices/$dev/vendor)
device=$(cat /sys/bus/pci/devices/$dev/device)
if [ -e /sys/bus/pci/devices/$dev/driver ]; then
echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
fi
echo $vendor $device > /sys/bus/pci/drivers/vfio-pci/new_id
done
See Documentation/vfio.txt in the Linux kernel tree for further
description of IOMMU groups and VFIO.
Then launch qemu including the option:
-device vfio-pci,host=0000:05:00.0
Legacy PCI interrupts (INTx) currently makes use of a kludge where we
trap BAR accesses and assume the access is in response to an interrupt,
therefore de-asserting and unmasking the interrupt. It's not quite as
targetted as using the EOI for this, but it's self contained and seems
to work across all architectures. The side-effect is a significant
performance slow-down for device in INTx mode. Some devices, like
graphics cards, don't really use their interrupt, so this can be turned
off with the x-intx=off option, which disables INTx alltogether. This
should be considered an experimental option until we refine this code.
Both MSI and MSI-X are supported and avoid these issues.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
2012-09-27 01:19:32 +08:00
|
|
|
} VFIOINTx;
|
|
|
|
|
|
|
|
/* Forward declaration: VFIOMSIVector keeps a back pointer to the device. */
struct VFIODevice;
|
|
|
|
|
|
|
|
typedef struct VFIOMSIVector {
|
|
|
|
EventNotifier interrupt; /* eventfd triggered on interrupt */
|
|
|
|
struct VFIODevice *vdev; /* back pointer to device */
|
|
|
|
int virq; /* KVM irqchip route for QEMU bypass */
|
|
|
|
bool use;
|
|
|
|
} VFIOMSIVector;
|
|
|
|
|
|
|
|
/*
 * Interrupt type identifiers.
 * NOTE(review): presumably the values stored in VFIODevice.interrupt
 * ("Current interrupt type") - confirm against the users of that field.
 */
enum {
    VFIO_INT_NONE = 0,
    VFIO_INT_INTx = 1,
    VFIO_INT_MSI  = 2,
    VFIO_INT_MSIX = 3,
};
|
|
|
|
|
|
|
|
/* Forward declaration: VFIOContainer holds a list head of VFIOGroup. */
struct VFIOGroup;
|
|
|
|
|
|
|
|
typedef struct VFIOContainer {
|
|
|
|
int fd; /* /dev/vfio/vfio, empowered by the attached groups */
|
|
|
|
struct {
|
|
|
|
/* enable abstraction to support various iommu backends */
|
|
|
|
union {
|
|
|
|
MemoryListener listener; /* Used by type1 iommu */
|
|
|
|
};
|
|
|
|
void (*release)(struct VFIOContainer *);
|
|
|
|
} iommu_data;
|
|
|
|
QLIST_HEAD(, VFIOGroup) group_list;
|
|
|
|
QLIST_ENTRY(VFIOContainer) next;
|
|
|
|
} VFIOContainer;
|
|
|
|
|
|
|
|
/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
|
|
|
|
typedef struct VFIOMSIXInfo {
|
|
|
|
uint8_t table_bar;
|
|
|
|
uint8_t pba_bar;
|
|
|
|
uint16_t entries;
|
|
|
|
uint32_t table_offset;
|
|
|
|
uint32_t pba_offset;
|
|
|
|
MemoryRegion mmap_mem;
|
|
|
|
void *mmap;
|
|
|
|
} VFIOMSIXInfo;
|
|
|
|
|
|
|
|
typedef struct VFIODevice {
|
|
|
|
PCIDevice pdev;
|
|
|
|
int fd;
|
|
|
|
VFIOINTx intx;
|
|
|
|
unsigned int config_size;
|
|
|
|
off_t config_offset; /* Offset of config space region within device fd */
|
|
|
|
unsigned int rom_size;
|
|
|
|
off_t rom_offset; /* Offset of ROM region within device fd */
|
|
|
|
int msi_cap_size;
|
|
|
|
VFIOMSIVector *msi_vectors;
|
|
|
|
VFIOMSIXInfo *msix;
|
|
|
|
int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
|
|
|
|
int interrupt; /* Current interrupt type */
|
|
|
|
VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
|
|
|
|
PCIHostDeviceAddress host;
|
|
|
|
QLIST_ENTRY(VFIODevice) next;
|
|
|
|
struct VFIOGroup *group;
|
|
|
|
bool reset_works;
|
|
|
|
} VFIODevice;
|
|
|
|
|
|
|
|
typedef struct VFIOGroup {
|
|
|
|
int fd;
|
|
|
|
int groupid;
|
|
|
|
VFIOContainer *container;
|
|
|
|
QLIST_HEAD(, VFIODevice) device_list;
|
|
|
|
QLIST_ENTRY(VFIOGroup) next;
|
|
|
|
QLIST_ENTRY(VFIOGroup) container_next;
|
|
|
|
} VFIOGroup;
|
|
|
|
|
|
|
|
#endif /* HW_VFIO_PCI_INT_H */
|