/* File: qemu/hw/virtio/virtio-iommu.c (C source, 1226 lines, 39 KiB) */

/*
* virtio-iommu device
*
* Copyright (c) 2020 Red Hat, Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2 or later, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/iov.h"
#include "qemu-common.h"
#include "hw/qdev-properties.h"
#include "hw/virtio/virtio.h"
#include "sysemu/kvm.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "standard-headers/linux/virtio_ids.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#include "hw/virtio/virtio-iommu.h"
#include "hw/pci/pci_bus.h"
#include "hw/pci/pci.h"
/*
 * VIOMMU_DEFAULT_QUEUE_SIZE: size passed to virtio_add_queue() for both the
 * request and the event virtqueue.
 * VIOMMU_PROBE_SIZE: number of bytes available for probe properties; it is
 * advertised to the guest through config.probe_size.
 */
#define VIOMMU_DEFAULT_QUEUE_SIZE 256
#define VIOMMU_PROBE_SIZE 512
/*
 * A translation domain: a set of IOVA mappings plus the endpoints that are
 * currently attached to it.
 */
typedef struct VirtIOIOMMUDomain {
/* Domain identifier; also the key of this domain in the domains tree */
uint32_t id;
/* Tree of VirtIOIOMMUInterval keys -> VirtIOIOMMUMapping values */
GTree *mappings;
/* Endpoints attached to this domain, linked through their 'next' field */
QLIST_HEAD(, VirtIOIOMMUEndpoint) endpoint_list;
} VirtIOIOMMUDomain;
/* A device (endpoint) known to the IOMMU, optionally attached to a domain. */
typedef struct VirtIOIOMMUEndpoint {
/* Endpoint identifier; also the key of this endpoint in the endpoints tree */
uint32_t id;
/* Domain the endpoint is attached to, or NULL when it is detached */
VirtIOIOMMUDomain *domain;
/* IOMMU memory region that receives map/unmap notifications for this endpoint */
IOMMUMemoryRegion *iommu_mr;
/* Linkage in the owning domain's endpoint_list */
QLIST_ENTRY(VirtIOIOMMUEndpoint) next;
} VirtIOIOMMUEndpoint;
/*
 * IOVA range used as the key of a domain's mappings tree. Both bounds are
 * inclusive: interval_cmp() only treats two intervals as distinct when one
 * ends strictly below the start of the other.
 */
typedef struct VirtIOIOMMUInterval {
uint64_t low; /* first address of the range */
uint64_t high; /* last address of the range */
} VirtIOIOMMUInterval;
/* Value stored in a domain's mappings tree for one mapped interval. */
typedef struct VirtIOIOMMUMapping {
/* Physical address that corresponds to the 'low' bound of the interval */
uint64_t phys_addr;
/* VIRTIO_IOMMU_MAP_F_* flags taken from the MAP request */
uint32_t flags;
} VirtIOIOMMUMapping;
/* Compose the PCI bus/devfn identifier of @dev out of its bus number and devfn. */
static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev)
{
    int bus_nr = pci_bus_num(dev->bus);

    return PCI_BUILD_BDF(bus_nr, dev->devfn);
}
/**
 * Look up the IOMMUPciBus that currently carries bus number @bus_num.
 *
 * SID based operations look buses up by number, so the per-number array is
 * filled lazily from the bus hash table: bus numbers may not be assigned yet
 * when the IOMMUPciBus is created (see virtio_iommu_find_add_as).
 *
 * Returns the matching IOMMUPciBus, or NULL if no registered bus has that
 * number.
 */
static IOMMUPciBus *iommu_find_iommu_pcibus(VirtIOIOMMU *s, uint8_t bus_num)
{
    IOMMUPciBus *candidate = s->iommu_pcibus_by_bus_num[bus_num];
    GHashTableIter it;

    /* Fast path: already cached for this bus number */
    if (candidate) {
        return candidate;
    }

    g_hash_table_iter_init(&it, s->as_by_busptr);
    while (g_hash_table_iter_next(&it, NULL, (void **)&candidate)) {
        if (pci_bus_num(candidate->bus) != bus_num) {
            continue;
        }
        /* Remember the match so later lookups skip the hash table walk */
        s->iommu_pcibus_by_bus_num[bus_num] = candidate;
        return candidate;
    }

    return NULL;
}
/*
 * Return the IOMMU memory region of the device identified by @sid (bus number
 * and devfn), or NULL when either the bus or the device is unknown.
 */
static IOMMUMemoryRegion *virtio_iommu_mr(VirtIOIOMMU *s, uint32_t sid)
{
    uint8_t bus_nr = PCI_BUS_NUM(sid);
    IOMMUPciBus *pcibus = iommu_find_iommu_pcibus(s, bus_nr);
    IOMMUDevice *sdev;
    uint8_t fn;

    if (!pcibus) {
        return NULL;
    }

    fn = sid & (PCI_DEVFN_MAX - 1);
    sdev = pcibus->pbdev[fn];

    return sdev ? &sdev->iommu_mr : NULL;
}
/*
 * Ordering function for interval keys: -1 when @a lies entirely below @b,
 * 1 when it lies entirely above, and 0 as soon as the two ranges overlap.
 * @user_data is unused.
 */
static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
{
    const VirtIOIOMMUInterval *lhs = a;
    const VirtIOIOMMUInterval *rhs = b;

    if (lhs->high < rhs->low) {
        return -1;
    }
    if (rhs->high < lhs->low) {
        return 1;
    }
    return 0;
}
/*
 * Send a MAP notification for [@virt_start, @virt_end] -> @paddr on @mr.
 *
 * Nothing is sent when @mr has no MAP notifier registered, when the mapping
 * carries the MMIO flag, or when @flags grants neither read nor write access.
 */
static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start,
                                    hwaddr virt_end, hwaddr paddr,
                                    uint32_t flags)
{
    IOMMUAccessFlags access =
        IOMMU_ACCESS_FLAG(flags & VIRTIO_IOMMU_MAP_F_READ,
                          flags & VIRTIO_IOMMU_MAP_F_WRITE);
    IOMMUTLBEvent ev;

    if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_MAP)) {
        return;
    }
    if ((flags & VIRTIO_IOMMU_MAP_F_MMIO) || !access) {
        return;
    }

    trace_virtio_iommu_notify_map(mr->parent_obj.name, virt_start, virt_end,
                                  paddr, access);

    ev.type = IOMMU_NOTIFIER_MAP;
    ev.entry.iova = virt_start;
    ev.entry.addr_mask = virt_end - virt_start;
    ev.entry.translated_addr = paddr;
    ev.entry.perm = access;
    ev.entry.target_as = &address_space_memory;

    memory_region_notify_iommu(mr, 0, ev);
}
/*
 * Send UNMAP notifications covering [@virt_start, @virt_end] on @mr.
 *
 * Nothing is sent when @mr has no UNMAP notifier registered. Otherwise the
 * range is cut into chunks whose mask is computed by dma_aligned_pow2_mask()
 * and one event is emitted per chunk.
 */
static void virtio_iommu_notify_unmap(IOMMUMemoryRegion *mr, hwaddr virt_start,
                                      hwaddr virt_end)
{
    uint64_t span = virt_end - virt_start;
    uint64_t chunk_mask;
    IOMMUTLBEvent ev;

    if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_UNMAP)) {
        return;
    }

    trace_virtio_iommu_notify_unmap(mr->parent_obj.name, virt_start, virt_end);

    ev.type = IOMMU_NOTIFIER_UNMAP;
    ev.entry.target_as = &address_space_memory;
    ev.entry.translated_addr = 0;
    ev.entry.perm = IOMMU_NONE;
    ev.entry.iova = virt_start;
    ev.entry.addr_mask = span;

    /*
     * A span of UINT64_MAX is sent as one single event. In that case
     * virt_end + 1 wraps around to virt_start, so the loop below does not
     * execute at all.
     */
    if (span == UINT64_MAX) {
        memory_region_notify_iommu(mr, 0, ev);
    }

    while (virt_start != virt_end + 1) {
        chunk_mask = dma_aligned_pow2_mask(virt_start, virt_end, 64);

        ev.entry.iova = virt_start;
        ev.entry.addr_mask = chunk_mask;
        memory_region_notify_iommu(mr, 0, ev);

        virt_start += chunk_mask + 1;
    }
}
/*
 * GTree traversal callback: emit an UNMAP notification for the interval in
 * @key on the IOMMU memory region passed as @data. @value is unused.
 * Always returns false so that the traversal continues.
 */
static gboolean virtio_iommu_notify_unmap_cb(gpointer key, gpointer value,
                                             gpointer data)
{
    IOMMUMemoryRegion *region = data;
    VirtIOIOMMUInterval *range = key;

    virtio_iommu_notify_unmap(region, range->low, range->high);

    return false;
}
/*
 * GTree traversal callback: emit a MAP notification for the interval in @key
 * and the mapping in @value on the IOMMU memory region passed as @data.
 * Always returns false so that the traversal continues.
 */
static gboolean virtio_iommu_notify_map_cb(gpointer key, gpointer value,
                                           gpointer data)
{
    IOMMUMemoryRegion *region = data;
    VirtIOIOMMUInterval *range = key;
    VirtIOIOMMUMapping *target = value;

    virtio_iommu_notify_map(region, range->low, range->high,
                            target->phys_addr, target->flags);

    return false;
}
/*
 * Detach @ep from its domain, if it has one: notify an unmap of every mapping
 * of the domain on the endpoint's memory region, unlink the endpoint from the
 * domain's endpoint list and clear its domain pointer.
 */
static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep)
{
    VirtIOIOMMUDomain *owner = ep->domain;

    if (owner) {
        g_tree_foreach(owner->mappings, virtio_iommu_notify_unmap_cb,
                       ep->iommu_mr);
        QLIST_REMOVE(ep, next);
        ep->domain = NULL;
    }
}
/*
 * Return the endpoint registered under @ep_id, creating and registering it on
 * first use. Returns NULL when no IOMMU memory region exists for @ep_id.
 */
static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
                                                      uint32_t ep_id)
{
    gpointer tree_key = GUINT_TO_POINTER(ep_id);
    VirtIOIOMMUEndpoint *endpoint = g_tree_lookup(s->endpoints, tree_key);
    IOMMUMemoryRegion *region;

    if (endpoint) {
        return endpoint;
    }

    region = virtio_iommu_mr(s, ep_id);
    if (!region) {
        return NULL;
    }

    endpoint = g_malloc0(sizeof(VirtIOIOMMUEndpoint));
    endpoint->iommu_mr = region;
    endpoint->id = ep_id;

    trace_virtio_iommu_get_endpoint(ep_id);
    g_tree_insert(s->endpoints, tree_key, endpoint);

    return endpoint;
}
/*
 * Value destructor of the endpoints tree: detach the endpoint from its domain
 * when it is still attached, then free it.
 */
static void virtio_iommu_put_endpoint(gpointer data)
{
    VirtIOIOMMUEndpoint *endpoint = data;

    if (endpoint->domain != NULL) {
        virtio_iommu_detach_endpoint_from_domain(endpoint);
    }

    trace_virtio_iommu_put_endpoint(endpoint->id);
    g_free(endpoint);
}
/*
 * Return the domain registered under @domain_id, creating it (with an empty
 * mappings tree and an empty endpoint list) and registering it on first use.
 */
static VirtIOIOMMUDomain *virtio_iommu_get_domain(VirtIOIOMMU *s,
                                                  uint32_t domain_id)
{
    gpointer tree_key = GUINT_TO_POINTER(domain_id);
    VirtIOIOMMUDomain *dom = g_tree_lookup(s->domains, tree_key);

    if (dom) {
        return dom;
    }

    dom = g_malloc0(sizeof(VirtIOIOMMUDomain));
    dom->id = domain_id;
    /* Both the interval keys and the mapping values are owned by the tree */
    dom->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
                                    NULL, (GDestroyNotify)g_free,
                                    (GDestroyNotify)g_free);
    QLIST_INIT(&dom->endpoint_list);

    g_tree_insert(s->domains, tree_key, dom);
    trace_virtio_iommu_get_domain(domain_id);

    return dom;
}
/*
 * Value destructor of the domains tree: detach every endpoint still attached
 * to the domain, destroy its mappings tree and free the domain.
 */
static void virtio_iommu_put_domain(gpointer data)
{
    VirtIOIOMMUDomain *dom = data;
    VirtIOIOMMUEndpoint *cur, *following;

    /* Detaching unlinks the endpoint, hence the _SAFE iteration */
    QLIST_FOREACH_SAFE(cur, &dom->endpoint_list, next, following) {
        virtio_iommu_detach_endpoint_from_domain(cur);
    }

    g_tree_destroy(dom->mappings);
    trace_virtio_iommu_put_domain(dom->id);
    g_free(dom);
}
/*
 * Callback registered with pci_setup_iommu(): return the address space used
 * by device @devfn on @bus, creating the per-bus and per-device bookkeeping
 * (including the IOMMU memory region) the first time they are requested.
 * @opaque is the VirtIOIOMMU instance.
 */
static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
                                              int devfn)
{
    static uint32_t mr_index;
    VirtIOIOMMU *s = opaque;
    IOMMUPciBus *pcibus;
    IOMMUDevice *device;
    char *mr_name;

    pcibus = g_hash_table_lookup(s->as_by_busptr, bus);
    if (!pcibus) {
        /* The per-bus entry is followed by one device slot per devfn */
        pcibus = g_malloc0(sizeof(IOMMUPciBus) +
                           sizeof(IOMMUDevice *) * PCI_DEVFN_MAX);
        pcibus->bus = bus;
        g_hash_table_insert(s->as_by_busptr, bus, pcibus);
    }

    device = pcibus->pbdev[devfn];
    if (device) {
        return &device->as;
    }

    mr_name = g_strdup_printf("%s-%d-%d",
                              TYPE_VIRTIO_IOMMU_MEMORY_REGION,
                              mr_index++, devfn);

    device = g_malloc0(sizeof(IOMMUDevice));
    pcibus->pbdev[devfn] = device;

    device->viommu = s;
    device->bus = bus;
    device->devfn = devfn;

    trace_virtio_iommu_init_iommu_mr(mr_name);

    memory_region_init_iommu(&device->iommu_mr, sizeof(device->iommu_mr),
                             TYPE_VIRTIO_IOMMU_MEMORY_REGION,
                             OBJECT(s), mr_name,
                             UINT64_MAX);
    address_space_init(&device->as,
                       MEMORY_REGION(&device->iommu_mr), TYPE_VIRTIO_IOMMU);
    g_free(mr_name);

    return &device->as;
}
/*
 * Handle an ATTACH request: attach the endpoint named in @req to the domain
 * named in @req, creating the domain if needed.
 *
 * An endpoint that is already attached is detached first, and the domain it
 * leaves is removed when that leaves it without endpoints. Existing mappings
 * of the new domain are replayed on the endpoint's memory region.
 *
 * Returns VIRTIO_IOMMU_S_NOENT when the endpoint does not exist,
 * VIRTIO_IOMMU_S_OK otherwise.
 */
static int virtio_iommu_attach(VirtIOIOMMU *s,
                               struct virtio_iommu_req_attach *req)
{
    const uint32_t domain_id = le32_to_cpu(req->domain);
    const uint32_t ep_id = le32_to_cpu(req->endpoint);
    VirtIOIOMMUDomain *old_domain, *new_domain;
    VirtIOIOMMUEndpoint *endpoint;

    trace_virtio_iommu_attach(domain_id, ep_id);

    endpoint = virtio_iommu_get_endpoint(s, ep_id);
    if (!endpoint) {
        return VIRTIO_IOMMU_S_NOENT;
    }

    old_domain = endpoint->domain;
    if (old_domain) {
        /* Already attached somewhere: leave that domain first */
        virtio_iommu_detach_endpoint_from_domain(endpoint);
        if (QLIST_EMPTY(&old_domain->endpoint_list)) {
            g_tree_remove(s->domains, GUINT_TO_POINTER(old_domain->id));
        }
    }

    new_domain = virtio_iommu_get_domain(s, domain_id);
    QLIST_INSERT_HEAD(&new_domain->endpoint_list, endpoint, next);
    endpoint->domain = new_domain;

    /* Replay the domain mappings on the endpoint's memory region */
    g_tree_foreach(new_domain->mappings, virtio_iommu_notify_map_cb,
                   endpoint->iommu_mr);

    return VIRTIO_IOMMU_S_OK;
}
/*
 * Handle a DETACH request: detach the endpoint named in @req from the domain
 * named in @req, and remove the domain once its endpoint list is empty.
 *
 * Returns VIRTIO_IOMMU_S_NOENT when the endpoint is unknown,
 * VIRTIO_IOMMU_S_INVAL when it is not attached to that domain, and
 * VIRTIO_IOMMU_S_OK otherwise.
 */
static int virtio_iommu_detach(VirtIOIOMMU *s,
                               struct virtio_iommu_req_detach *req)
{
    const uint32_t domain_id = le32_to_cpu(req->domain);
    const uint32_t ep_id = le32_to_cpu(req->endpoint);
    VirtIOIOMMUEndpoint *endpoint;
    VirtIOIOMMUDomain *owner;

    trace_virtio_iommu_detach(domain_id, ep_id);

    endpoint = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id));
    if (!endpoint) {
        return VIRTIO_IOMMU_S_NOENT;
    }

    owner = endpoint->domain;
    if (!owner || owner->id != domain_id) {
        return VIRTIO_IOMMU_S_INVAL;
    }

    virtio_iommu_detach_endpoint_from_domain(endpoint);

    if (QLIST_EMPTY(&owner->endpoint_list)) {
        g_tree_remove(s->domains, GUINT_TO_POINTER(owner->id));
    }

    return VIRTIO_IOMMU_S_OK;
}
/*
 * Handle a MAP request: record the mapping [virt_start, virt_end] ->
 * phys_start in the domain named in @req and notify every endpoint attached
 * to that domain.
 *
 * Returns:
 *   VIRTIO_IOMMU_S_INVAL  unknown flag bits, an inverted range
 *                         (virt_start > virt_end), or a range overlapping an
 *                         existing mapping;
 *   VIRTIO_IOMMU_S_NOENT  the domain does not exist;
 *   VIRTIO_IOMMU_S_OK     the mapping was recorded.
 */
static int virtio_iommu_map(VirtIOIOMMU *s,
                            struct virtio_iommu_req_map *req)
{
    uint32_t domain_id = le32_to_cpu(req->domain);
    uint64_t phys_start = le64_to_cpu(req->phys_start);
    uint64_t virt_start = le64_to_cpu(req->virt_start);
    uint64_t virt_end = le64_to_cpu(req->virt_end);
    uint32_t flags = le32_to_cpu(req->flags);
    VirtIOIOMMUInterval lookup_key;
    VirtIOIOMMUDomain *domain;
    VirtIOIOMMUInterval *interval;
    VirtIOIOMMUMapping *mapping;
    VirtIOIOMMUEndpoint *ep;

    if (flags & ~VIRTIO_IOMMU_MAP_F_MASK) {
        return VIRTIO_IOMMU_S_INVAL;
    }

    domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
    if (!domain) {
        return VIRTIO_IOMMU_S_NOENT;
    }

    /*
     * The range comes from the guest. An inverted interval must never become
     * a tree key: interval_cmp() assumes low <= high, and the notifier mask
     * (virt_end - virt_start) would wrap around.
     */
    if (virt_start > virt_end) {
        return VIRTIO_IOMMU_S_INVAL;
    }

    /* Probe for an overlap with a stack key; allocate only once accepted */
    lookup_key.low = virt_start;
    lookup_key.high = virt_end;
    if (g_tree_lookup(domain->mappings, &lookup_key)) {
        return VIRTIO_IOMMU_S_INVAL;
    }

    trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags);

    interval = g_malloc0(sizeof(*interval));
    interval->low = virt_start;
    interval->high = virt_end;

    mapping = g_malloc0(sizeof(*mapping));
    mapping->phys_addr = phys_start;
    mapping->flags = flags;

    /* The tree takes ownership of both the key and the value */
    g_tree_insert(domain->mappings, interval, mapping);

    QLIST_FOREACH(ep, &domain->endpoint_list, next) {
        virtio_iommu_notify_map(ep->iommu_mr, virt_start, virt_end, phys_start,
                                flags);
    }

    return VIRTIO_IOMMU_S_OK;
}
/*
 * Handle an UNMAP request: remove every mapping of the domain named in @req
 * that lies entirely inside [virt_start, virt_end], notifying the attached
 * endpoints for each removed mapping.
 *
 * Returns VIRTIO_IOMMU_S_NOENT when the domain does not exist,
 * VIRTIO_IOMMU_S_RANGE as soon as an overlapping mapping is only partially
 * covered by the request (processing stops there), and VIRTIO_IOMMU_S_OK
 * otherwise.
 */
static int virtio_iommu_unmap(VirtIOIOMMU *s,
                              struct virtio_iommu_req_unmap *req)
{
    const uint32_t domain_id = le32_to_cpu(req->domain);
    const uint64_t virt_start = le64_to_cpu(req->virt_start);
    const uint64_t virt_end = le64_to_cpu(req->virt_end);
    VirtIOIOMMUInterval request = { .low = virt_start, .high = virt_end };
    VirtIOIOMMUInterval *hit_key;
    VirtIOIOMMUMapping *hit_val;
    VirtIOIOMMUEndpoint *endpoint;
    VirtIOIOMMUDomain *dom;

    trace_virtio_iommu_unmap(domain_id, virt_start, virt_end);

    dom = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
    if (!dom) {
        return VIRTIO_IOMMU_S_NOENT;
    }

    for (;;) {
        uint64_t hit_low, hit_high;

        if (!g_tree_lookup_extended(dom->mappings, &request,
                                    (void **)&hit_key, (void **)&hit_val)) {
            /* Nothing overlaps the requested range any more */
            return VIRTIO_IOMMU_S_OK;
        }

        /* Copy the bounds: removing the node frees the key */
        hit_low = hit_key->low;
        hit_high = hit_key->high;

        if (request.low > hit_low || request.high < hit_high) {
            /* The mapping sticks out of the requested range */
            return VIRTIO_IOMMU_S_RANGE;
        }

        QLIST_FOREACH(endpoint, &dom->endpoint_list, next) {
            virtio_iommu_notify_unmap(endpoint->iommu_mr, hit_low, hit_high);
        }
        g_tree_remove(dom->mappings, hit_key);
        trace_virtio_iommu_unmap_done(domain_id, hit_low, hit_high);
    }
}
/*
 * Write one RESV_MEM probe property per reserved region of @s into @buf.
 *
 * @ep is only used for tracing; @free is the room available in @buf.
 * Returns the number of bytes written, or -ENOSPC when the properties do not
 * all fit (in which case nothing is written).
 */
static ssize_t virtio_iommu_fill_resv_mem_prop(VirtIOIOMMU *s, uint32_t ep,
                                               uint8_t *buf, size_t free)
{
    struct virtio_iommu_probe_resv_mem prop = {};
    const size_t prop_sz = sizeof(prop);
    /* The length field of a property does not count its own header */
    const size_t payload_len = prop_sz - sizeof(prop.head);
    const size_t needed = prop_sz * s->nb_reserved_regions;
    uint8_t *cursor = buf;
    int idx;

    if (needed > free) {
        return -ENOSPC;
    }

    for (idx = 0; idx < s->nb_reserved_regions; idx++, cursor += prop_sz) {
        unsigned subtype = s->reserved_regions[idx].type;

        assert(subtype == VIRTIO_IOMMU_RESV_MEM_T_RESERVED ||
               subtype == VIRTIO_IOMMU_RESV_MEM_T_MSI);

        prop.head.type = cpu_to_le16(VIRTIO_IOMMU_PROBE_T_RESV_MEM);
        prop.head.length = cpu_to_le16(payload_len);
        prop.subtype = subtype;
        prop.start = cpu_to_le64(s->reserved_regions[idx].low);
        prop.end = cpu_to_le64(s->reserved_regions[idx].high);

        memcpy(cursor, &prop, prop_sz);

        trace_virtio_iommu_fill_resv_property(ep, prop.subtype,
                                              prop.start, prop.end);
    }

    return needed;
}
/**
 * virtio_iommu_probe - Fill the probe request buffer with
 * the properties the device is able to return
 *
 * @buf offers VIOMMU_PROBE_SIZE bytes; only reserved-memory properties are
 * written to it.
 *
 * Returns VIRTIO_IOMMU_S_NOENT when the endpoint named in @req has no IOMMU
 * memory region, VIRTIO_IOMMU_S_INVAL when the properties do not fit, and
 * VIRTIO_IOMMU_S_OK otherwise.
 */
static int virtio_iommu_probe(VirtIOIOMMU *s,
                              struct virtio_iommu_req_probe *req,
                              uint8_t *buf)
{
    const uint32_t ep_id = le32_to_cpu(req->endpoint);
    ssize_t written;

    if (!virtio_iommu_mr(s, ep_id)) {
        return VIRTIO_IOMMU_S_NOENT;
    }

    written = virtio_iommu_fill_resv_mem_prop(s, ep_id, buf,
                                              VIOMMU_PROBE_SIZE);

    return written < 0 ? VIRTIO_IOMMU_S_INVAL : VIRTIO_IOMMU_S_OK;
}
/*
 * Copy a request out of the scatter-gather list @iov into @req.
 *
 * @req_sz is the size of the whole request structure; the trailing
 * struct virtio_iommu_req_tail is not read from @iov.
 * Returns 0 on success, or VIRTIO_IOMMU_S_INVAL when @iov holds fewer bytes
 * than expected.
 */
static int virtio_iommu_iov_to_req(struct iovec *iov,
                                   unsigned int iov_cnt,
                                   void *req, size_t req_sz)
{
    const size_t wanted = req_sz - sizeof(struct virtio_iommu_req_tail);
    const size_t copied = iov_to_buf(iov, iov_cnt, 0, req, wanted);

    if (unlikely(copied != wanted)) {
        return VIRTIO_IOMMU_S_INVAL;
    }

    return 0;
}
/*
 * Generate virtio_iommu_handle_<req>(): read a struct virtio_iommu_req_<req>
 * out of the iovec with virtio_iommu_iov_to_req() and, when that succeeds,
 * pass it to virtio_iommu_<req>(). The generated function returns the error
 * of the copy step, or else the status of the request handler.
 */
#define virtio_iommu_handle_req(__req) \
static int virtio_iommu_handle_ ## __req(VirtIOIOMMU *s, \
struct iovec *iov, \
unsigned int iov_cnt) \
{ \
struct virtio_iommu_req_ ## __req req; \
int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, sizeof(req)); \
\
return ret ? ret : virtio_iommu_ ## __req(s, &req); \
}
/* Instantiate the wrappers for the requests that produce no payload */
virtio_iommu_handle_req(attach)
virtio_iommu_handle_req(detach)
virtio_iommu_handle_req(map)
virtio_iommu_handle_req(unmap)
/*
 * Read a PROBE request out of @iov and, when that succeeds, let
 * virtio_iommu_probe() fill @buf. Returns the error of the copy step, or else
 * the status of virtio_iommu_probe().
 */
static int virtio_iommu_handle_probe(VirtIOIOMMU *s,
                                     struct iovec *iov,
                                     unsigned int iov_cnt,
                                     uint8_t *buf)
{
    struct virtio_iommu_req_probe probe_req;
    int err;

    err = virtio_iommu_iov_to_req(iov, iov_cnt, &probe_req, sizeof(probe_req));
    if (err) {
        return err;
    }

    return virtio_iommu_probe(s, &probe_req, buf);
}
/*
 * Request virtqueue handler: pop every available element, dispatch it on the
 * request type found in its header and push the reply back to the guest.
 *
 * For every request but PROBE the reply is a bare struct
 * virtio_iommu_req_tail. For PROBE the reply is a buffer of
 * config.probe_size bytes of properties followed by the tail.
 *
 * The reply state (tail, output size, probe buffer) is scoped to one loop
 * iteration. It must not leak into the next request: a probe buffer that
 * outlived its iteration would be sent again after having been freed, and
 * freed a second time.
 *
 * An element whose in/out buffers cannot even hold a header and a tail marks
 * the device as broken (virtio_error) and stops the processing.
 */
static void virtio_iommu_handle_command(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
    struct virtio_iommu_req_head head;
    VirtQueueElement *elem;
    unsigned int iov_cnt;
    struct iovec *iov;
    size_t sz;

    for (;;) {
        /* Per-request reply state, reinitialized for every element */
        struct virtio_iommu_req_tail tail = {};
        size_t output_size = sizeof(tail);
        void *buf = NULL;

        elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
        if (!elem) {
            return;
        }

        if (iov_size(elem->in_sg, elem->in_num) < sizeof(tail) ||
            iov_size(elem->out_sg, elem->out_num) < sizeof(head)) {
            virtio_error(vdev, "virtio-iommu bad head/tail size");
            virtqueue_detach_element(vq, elem, 0);
            g_free(elem);
            break;
        }

        iov_cnt = elem->out_num;
        iov = elem->out_sg;
        sz = iov_to_buf(iov, iov_cnt, 0, &head, sizeof(head));
        if (unlikely(sz != sizeof(head))) {
            tail.status = VIRTIO_IOMMU_S_DEVERR;
            goto out;
        }

        qemu_mutex_lock(&s->mutex);
        switch (head.type) {
        case VIRTIO_IOMMU_T_ATTACH:
            tail.status = virtio_iommu_handle_attach(s, iov, iov_cnt);
            break;
        case VIRTIO_IOMMU_T_DETACH:
            tail.status = virtio_iommu_handle_detach(s, iov, iov_cnt);
            break;
        case VIRTIO_IOMMU_T_MAP:
            tail.status = virtio_iommu_handle_map(s, iov, iov_cnt);
            break;
        case VIRTIO_IOMMU_T_UNMAP:
            tail.status = virtio_iommu_handle_unmap(s, iov, iov_cnt);
            break;
        case VIRTIO_IOMMU_T_PROBE:
        {
            struct virtio_iommu_req_tail *ptail;

            /* Reply layout: probe_size bytes of properties, then the tail */
            output_size = s->config.probe_size + sizeof(tail);
            buf = g_malloc0(output_size);

            ptail = (struct virtio_iommu_req_tail *)
                        (buf + s->config.probe_size);
            ptail->status = virtio_iommu_handle_probe(s, iov, iov_cnt, buf);
            break;
        }
        default:
            tail.status = VIRTIO_IOMMU_S_UNSUPP;
        }
        qemu_mutex_unlock(&s->mutex);

out:
        sz = iov_from_buf(elem->in_sg, elem->in_num, 0,
                          buf ? buf : &tail, output_size);
        assert(sz == output_size);

        virtqueue_push(vq, elem, sz);
        virtio_notify(vdev, vq);
        g_free(elem);
        g_free(buf);
    }
}
/*
 * Report a translation fault to the guest through the event virtqueue.
 *
 * The event is dropped, with an error message printed once, when the guest
 * provides no buffer. A buffer that is too small for the fault record marks
 * the device as broken.
 */
static void virtio_iommu_report_fault(VirtIOIOMMU *viommu, uint8_t reason,
                                      int flags, uint32_t endpoint,
                                      uint64_t address)
{
    VirtIODevice *vdev = &viommu->parent_obj;
    VirtQueue *event_vq = viommu->event_vq;
    struct virtio_iommu_fault record;
    VirtQueueElement *slot;
    size_t copied;

    /* Build the record with all fields in guest (little-endian) byte order */
    memset(&record, 0, sizeof(record));
    record.reason = reason;
    record.flags = cpu_to_le32(flags);
    record.endpoint = cpu_to_le32(endpoint);
    record.address = cpu_to_le64(address);

    slot = virtqueue_pop(event_vq, sizeof(VirtQueueElement));
    if (!slot) {
        error_report_once(
            "no buffer available in event queue to report event");
        return;
    }

    if (iov_size(slot->in_sg, slot->in_num) < sizeof(record)) {
        virtio_error(vdev, "error buffer of wrong size");
        virtqueue_detach_element(event_vq, slot, 0);
        g_free(slot);
        return;
    }

    copied = iov_from_buf(slot->in_sg, slot->in_num, 0,
                          &record, sizeof(record));
    assert(copied == sizeof(record));

    trace_virtio_iommu_report_fault(reason, flags, endpoint, address);
    virtqueue_push(event_vq, slot, copied);
    virtio_notify(vdev, event_vq);
    g_free(slot);
}
/*
 * Translate the access @flag at address @addr for the device behind @mr.
 *
 * The returned entry defaults to an identity translation with IOMMU_NONE
 * permission; entry.perm is only set to @flag when the access is allowed:
 *   - the endpoint is unknown or not attached to a domain, and the BYPASS
 *     feature was negotiated (identity translation);
 *   - @addr falls into an MSI reserved region (identity translation);
 *   - a mapping of the endpoint's domain contains @addr and grants the
 *     requested access (translated through the mapping).
 * In all other cases a fault is reported to the guest and the entry keeps
 * IOMMU_NONE. @iommu_idx is unused.
 */
static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
                                            IOMMUAccessFlags flag,
                                            int iommu_idx)
{
    IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
    VirtIOIOMMUInterval interval, *mapping_key;
    VirtIOIOMMUMapping *mapping_value;
    VirtIOIOMMU *s = sdev->viommu;
    bool read_fault, write_fault;
    VirtIOIOMMUEndpoint *ep;
    uint32_t sid, flags;
    bool bypass_allowed;
    bool found;
    int i;
    IOMMUTLBEntry entry = {
        .target_as = &address_space_memory,
        .iova = addr,
        .translated_addr = addr,
        /*
         * Smallest granule of the page size mask. The mask is 64 bits wide,
         * so compute it with 64-bit arithmetic rather than int arithmetic.
         */
        .addr_mask = (1ULL << ctz64(s->config.page_size_mask)) - 1,
        .perm = IOMMU_NONE,
    };

    /*
     * Look up exactly the byte being accessed. A wider key such as
     * [addr, addr + 1] would also match a mapping that only starts at
     * addr + 1, and would wrap around for the last address.
     */
    interval.low = addr;
    interval.high = addr;

    bypass_allowed = virtio_vdev_has_feature(&s->parent_obj,
                                             VIRTIO_IOMMU_F_BYPASS);

    sid = virtio_iommu_get_bdf(sdev);

    trace_virtio_iommu_translate(mr->parent_obj.name, sid, addr, flag);
    qemu_mutex_lock(&s->mutex);

    ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
    if (!ep) {
        if (!bypass_allowed) {
            error_report_once("%s sid=%d is not known!!", __func__, sid);
            virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_UNKNOWN,
                                      VIRTIO_IOMMU_FAULT_F_ADDRESS,
                                      sid, addr);
        } else {
            entry.perm = flag;
        }
        goto unlock;
    }

    /* Reserved regions take precedence over the domain mappings */
    for (i = 0; i < s->nb_reserved_regions; i++) {
        ReservedRegion *reg = &s->reserved_regions[i];

        if (addr >= reg->low && addr <= reg->high) {
            switch (reg->type) {
            case VIRTIO_IOMMU_RESV_MEM_T_MSI:
                entry.perm = flag;
                break;
            case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
            default:
                virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
                                          VIRTIO_IOMMU_FAULT_F_ADDRESS,
                                          sid, addr);
                break;
            }
            goto unlock;
        }
    }

    if (!ep->domain) {
        if (!bypass_allowed) {
            error_report_once("%s %02x:%02x.%01x not attached to any domain",
                              __func__, PCI_BUS_NUM(sid),
                              PCI_SLOT(sid), PCI_FUNC(sid));
            virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_DOMAIN,
                                      VIRTIO_IOMMU_FAULT_F_ADDRESS,
                                      sid, addr);
        } else {
            entry.perm = flag;
        }
        goto unlock;
    }

    found = g_tree_lookup_extended(ep->domain->mappings, (gpointer)(&interval),
                                   (void **)&mapping_key,
                                   (void **)&mapping_value);
    if (!found) {
        error_report_once("%s no mapping for 0x%"PRIx64" for sid=%d",
                          __func__, addr, sid);
        virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
                                  VIRTIO_IOMMU_FAULT_F_ADDRESS,
                                  sid, addr);
        goto unlock;
    }

    read_fault = (flag & IOMMU_RO) &&
                    !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_READ);
    write_fault = (flag & IOMMU_WO) &&
                    !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_WRITE);

    flags = read_fault ? VIRTIO_IOMMU_FAULT_F_READ : 0;
    flags |= write_fault ? VIRTIO_IOMMU_FAULT_F_WRITE : 0;
    if (flags) {
        error_report_once("%s permission error on 0x%"PRIx64"(%d): allowed=%d",
                          __func__, addr, flag, mapping_value->flags);
        flags |= VIRTIO_IOMMU_FAULT_F_ADDRESS;
        virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
                                  flags, sid, addr);
        goto unlock;
    }

    /* Keep the offset of @addr inside the mapped interval */
    entry.translated_addr = addr - mapping_key->low + mapping_value->phys_addr;
    entry.perm = flag;
    trace_virtio_iommu_translate_out(addr, entry.translated_addr, sid);

unlock:
    qemu_mutex_unlock(&s->mutex);
    return entry;
}
/* Copy the device configuration structure into @config_data. */
static void virtio_iommu_get_config(VirtIODevice *vdev, uint8_t *config_data)
{
    VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
    struct virtio_iommu_config *cfg = &s->config;

    trace_virtio_iommu_get_config(cfg->page_size_mask,
                                  cfg->input_range.start,
                                  cfg->input_range.end,
                                  cfg->domain_range.end,
                                  cfg->probe_size);

    memcpy(config_data, cfg, sizeof(struct virtio_iommu_config));
}
/*
 * Configuration write hook: the written values are decoded for tracing only,
 * the device configuration itself is left untouched.
 */
static void virtio_iommu_set_config(VirtIODevice *vdev,
                                    const uint8_t *config_data)
{
    struct virtio_iommu_config written;

    memcpy(&written, config_data, sizeof(written));

    trace_virtio_iommu_set_config(written.page_size_mask,
                                  written.input_range.start,
                                  written.input_range.end,
                                  written.domain_range.end,
                                  written.probe_size);
}
/* Return the offered feature bits @f extended with the device's own features. */
static uint64_t virtio_iommu_get_features(VirtIODevice *vdev, uint64_t f,
                                          Error **errp)
{
    VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
    uint64_t offered = f | s->features;

    trace_virtio_iommu_get_features(offered);

    return offered;
}
/*
 * Three-way comparison of two unsigned integers stored in pointers; used to
 * order the domains and endpoints trees by identifier. @user_data is unused.
 */
static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
{
    const guint lhs = GPOINTER_TO_UINT(a);
    const guint rhs = GPOINTER_TO_UINT(b);

    if (lhs < rhs) {
        return -1;
    }
    return lhs > rhs ? 1 : 0;
}
/*
 * GTree traversal callback used by replay: trace and notify the mapping
 * (@key interval, @value mapping) on the memory region passed as @data.
 * Always returns false so that the traversal continues.
 */
static gboolean virtio_iommu_remap(gpointer key, gpointer value, gpointer data)
{
    IOMMUMemoryRegion *region = data;
    VirtIOIOMMUInterval *range = key;
    VirtIOIOMMUMapping *target = value;

    trace_virtio_iommu_remap(region->parent_obj.name, range->low, range->high,
                             target->phys_addr);
    virtio_iommu_notify_map(region, range->low, range->high,
                            target->phys_addr, target->flags);

    return false;
}
/*
 * Replay hook: re-notify every mapping of the domain the device behind @mr is
 * attached to. Nothing happens when the endpoints tree does not exist yet,
 * when the endpoint is unknown or when it is not attached. @n is unused.
 */
static void virtio_iommu_replay(IOMMUMemoryRegion *mr, IOMMUNotifier *n)
{
    IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
    VirtIOIOMMU *s = sdev->viommu;
    uint32_t sid = virtio_iommu_get_bdf(sdev);
    VirtIOIOMMUEndpoint *endpoint;

    qemu_mutex_lock(&s->mutex);

    if (s->endpoints) {
        endpoint = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
        if (endpoint && endpoint->domain) {
            g_tree_foreach(endpoint->domain->mappings, virtio_iommu_remap, mr);
        }
    }

    qemu_mutex_unlock(&s->mutex);
}
/*
 * Notifier flag change hook for the IOMMU memory region.
 *
 * Returns -EINVAL (with @errp set) when the new flags request dev-iotlb
 * notifications, 0 otherwise. A transition from or to "no notifier" is only
 * traced.
 */
static int virtio_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr,
                                            IOMMUNotifierFlag old,
                                            IOMMUNotifierFlag new,
                                            Error **errp)
{
    /*
     * dev-iotlb (PCIe ATS) is not supported by virtio-iommu. Failing the
     * registration here lets vhost notice it and fall back to plain UNMAP
     * invalidations for its translations.
     */
    if (new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) {
        error_setg(errp, "Virtio-iommu does not support dev-iotlb yet");
        return -EINVAL;
    }

    if (old == IOMMU_NOTIFIER_NONE) {
        trace_virtio_iommu_notify_flag_add(iommu_mr->parent_obj.name);
    } else if (new == IOMMU_NOTIFIER_NONE) {
        trace_virtio_iommu_notify_flag_del(iommu_mr->parent_obj.name);
    }
    return 0;
}
/*
 * Restrict the page size mask exposed to the guest.
 *
 * The default mask (TARGET_PAGE_MASK) is the smallest supported guest granule,
 * for example 0xfffffffffffff000. When an assigned device has page size
 * restrictions due to the hardware IOMMU configuration, that restriction is
 * applied to the mask.
 *
 * Returns 0 on success, -1 (with @errp set) when @new_mask cannot be honoured.
 */
static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
                                           uint64_t new_mask,
                                           Error **errp)
{
    IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
    VirtIOIOMMU *s = sdev->viommu;
    const uint64_t cur_mask = s->config.page_size_mask;

    trace_virtio_iommu_set_page_size_mask(mr->parent_obj.name, cur_mask,
                                          new_mask);

    /* The two masks must share at least one page size */
    if (!(cur_mask & new_mask)) {
        error_setg(errp, "virtio-iommu page mask 0x%"PRIx64
                   " is incompatible with mask 0x%"PRIx64, cur_mask, new_mask);
        return -1;
    }

    if (!phase_check(PHASE_MACHINE_READY)) {
        /* Machine not finalized yet: the mask may still be narrowed */
        s->config.page_size_mask &= new_mask;
        return 0;
    }

    /*
     * After the machine is finalized, we can't change the mask anymore. If by
     * chance the hotplugged device supports the same granule, we can still
     * accept it. Having different masks is possible but the guest will use
     * sub-optimal block sizes, so warn about it.
     */
    if (ctz64(new_mask) != ctz64(cur_mask)) {
        error_setg(errp, "virtio-iommu page mask 0x%"PRIx64
                   " is incompatible with mask 0x%"PRIx64, cur_mask,
                   new_mask);
        return -1;
    }

    if (new_mask != cur_mask) {
        warn_report("virtio-iommu page mask 0x%"PRIx64
                    " does not match 0x%"PRIx64, cur_mask, new_mask);
    }

    return 0;
}
/*
 * Realize the virtio-iommu device: set up the virtio device and its two
 * virtqueues, fill in the default configuration and feature bits, and install
 * the device as the IOMMU of its primary PCI bus.
 *
 * Fails (setting @errp) when no primary PCI bus is configured. That check is
 * done before anything is allocated, so a failed realize leaves no resources
 * behind.
 */
static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOIOMMU *s = VIRTIO_IOMMU(dev);

    if (!s->primary_bus) {
        error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!");
        return;
    }

    virtio_init(vdev, "virtio-iommu", VIRTIO_ID_IOMMU,
                sizeof(struct virtio_iommu_config));

    memset(s->iommu_pcibus_by_bus_num, 0, sizeof(s->iommu_pcibus_by_bus_num));

    s->req_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE,
                                 virtio_iommu_handle_command);
    s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, NULL);

    s->config.page_size_mask = TARGET_PAGE_MASK;
    /*
     * The field is 64 bits wide: use UINT64_MAX rather than -1UL, which is
     * only 32 bits wide on hosts where long is 32 bits.
     */
    s->config.input_range.end = UINT64_MAX;
    s->config.domain_range.end = 32;
    s->config.probe_size = VIOMMU_PROBE_SIZE;

    virtio_add_feature(&s->features, VIRTIO_RING_F_EVENT_IDX);
    virtio_add_feature(&s->features, VIRTIO_RING_F_INDIRECT_DESC);
    virtio_add_feature(&s->features, VIRTIO_F_VERSION_1);
    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_INPUT_RANGE);
    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_DOMAIN_RANGE);
    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MAP_UNMAP);
    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_BYPASS);
    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MMIO);
    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_PROBE);

    qemu_mutex_init(&s->mutex);

    /* Keys are PCIBus pointers (not owned); values are freed with the table */
    s->as_by_busptr = g_hash_table_new_full(NULL, NULL, NULL, g_free);

    pci_setup_iommu(s->primary_bus, virtio_iommu_find_add_as, s);
}
/*
 * History note (commit "qdev: Unrealize must not fail", 2020-05-05):
 * unrealize methods no longer take an Error ** parameter. A partly failed
 * unrealize cannot be rolled back, so unrealize callbacks are required to
 * always succeed.
 */
/*
 * Unrealize the device: release everything set up by realize and reset.
 *
 * The domains and endpoints trees are checked for NULL before being
 * destroyed; in the code visible here they are only created by
 * virtio_iommu_device_reset().
 */
static void virtio_iommu_device_unrealize(DeviceState *dev)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOIOMMU *s = VIRTIO_IOMMU(dev);

    g_hash_table_destroy(s->as_by_busptr);
    if (s->domains) {
        g_tree_destroy(s->domains);
    }
    if (s->endpoints) {
        g_tree_destroy(s->endpoints);
    }

    /* Counterpart of the qemu_mutex_init() done at realize time */
    qemu_mutex_destroy(&s->mutex);

    virtio_delete_queue(s->req_vq);
    virtio_delete_queue(s->event_vq);
    virtio_cleanup(vdev);
}
/*
 * Reset the device: drop all domains and endpoints, then start over with two
 * empty trees keyed by identifier.
 */
static void virtio_iommu_device_reset(VirtIODevice *vdev)
{
    VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);

    trace_virtio_iommu_device_reset();

    if (s->domains != NULL) {
        g_tree_destroy(s->domains);
    }
    if (s->endpoints != NULL) {
        g_tree_destroy(s->endpoints);
    }

    /* Keys are plain integers; values are released by the put callbacks */
    s->domains = g_tree_new_full((GCompareDataFunc)int_cmp,
                                 NULL, NULL, virtio_iommu_put_domain);
    s->endpoints = g_tree_new_full((GCompareDataFunc)int_cmp,
                                   NULL, NULL, virtio_iommu_put_endpoint);
}
/* Status change hook: the new @status is only traced. */
static void virtio_iommu_set_status(VirtIODevice *vdev, uint8_t status)
{
    trace_virtio_iommu_device_status(status);
}
/*
 * instance_init hook referenced by virtio_iommu_info.
 * Intentionally empty: no per-instance setup is performed here.
 */
static void virtio_iommu_instance_init(Object *obj)
{
}
/*
 * VMStateDescription initializer for a VirtIOIOMMUInterval: the two
 * 64-bit bounds, @low followed by @high.
 */
#define VMSTATE_INTERVAL                                   \
{                                                          \
    .name = "interval",                                    \
    .minimum_version_id = 1,                               \
    .version_id = 1,                                       \
    .fields = (VMStateField[]) {                           \
        VMSTATE_UINT64(low, VirtIOIOMMUInterval),          \
        VMSTATE_UINT64(high, VirtIOIOMMUInterval),         \
        VMSTATE_END_OF_LIST()                              \
    }                                                      \
}
/*
 * VMStateDescription initializer for a VirtIOIOMMUMapping: the 64-bit
 * @phys_addr followed by the 32-bit @flags.
 */
#define VMSTATE_MAPPING                                    \
{                                                          \
    .name = "mapping",                                     \
    .minimum_version_id = 1,                               \
    .version_id = 1,                                       \
    .fields = (VMStateField[]) {                           \
        VMSTATE_UINT64(phys_addr, VirtIOIOMMUMapping),     \
        VMSTATE_UINT32(flags, VirtIOIOMMUMapping),         \
        VMSTATE_END_OF_LIST()                              \
    },                                                     \
}
/*
 * Descriptor pair handed to VMSTATE_GTREE_V for the per-domain mapping
 * tree: slot 0 describes the tree values, slot 1 the tree keys.
 */
static const VMStateDescription vmstate_interval_mapping[2] = {
    [0] = VMSTATE_MAPPING,  /* value */
    [1] = VMSTATE_INTERVAL, /* key   */
};
static int domain_preload(void *opaque)
{
VirtIOIOMMUDomain *domain = opaque;
domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
NULL, g_free, g_free);
return 0;
}
/*
 * Migration description of one endpoint: only its 32-bit id is listed.
 * The domain and iommu_mr pointers are filled in again by
 * reconstruct_endpoints().
 */
static const VMStateDescription vmstate_endpoint = {
    .name = "endpoint",
    .minimum_version_id = 1,
    .version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(id, VirtIOIOMMUEndpoint),
        VMSTATE_END_OF_LIST()
    },
};
/*
 * Migration description of one domain: its id, its mapping tree
 * (VirtIOIOMMUInterval keys, VirtIOIOMMUMapping values) and the list of
 * endpoints attached to it. domain_preload() runs before the fields are
 * loaded and allocates the mapping tree.
 */
static const VMStateDescription vmstate_domain = {
    .name = "domain",
    .minimum_version_id = 1,
    .version_id = 1,
    .pre_load = domain_preload,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(id, VirtIOIOMMUDomain),
        VMSTATE_GTREE_V(mappings, VirtIOIOMMUDomain, 1,
                        vmstate_interval_mapping,
                        VirtIOIOMMUInterval, VirtIOIOMMUMapping),
        VMSTATE_QLIST_V(endpoint_list, VirtIOIOMMUDomain, 1,
                        vmstate_endpoint, VirtIOIOMMUEndpoint, next),
        VMSTATE_END_OF_LIST()
    },
};
/**
 * reconstruct_endpoints - g_tree_foreach() callback run over the domain tree
 * @key: tree key of the visited domain (unused)
 * @value: the visited VirtIOIOMMUDomain
 * @data: the VirtIOIOMMU owning the trees
 *
 * For every endpoint on the domain's endpoint list, looks up the IOMMU
 * memory region for the endpoint id, restores the endpoint's domain and
 * iommu_mr pointers and inserts the endpoint into the device-wide endpoint
 * tree, keyed by its id. Asserts that the memory region lookup succeeds.
 *
 * Returns: false, so that the traversal moves on to the next domain.
 */
static gboolean reconstruct_endpoints(gpointer key, gpointer value,
                                      gpointer data)
{
    VirtIOIOMMU *viommu = data;
    VirtIOIOMMUDomain *domain = value;
    VirtIOIOMMUEndpoint *ep;

    QLIST_FOREACH(ep, &domain->endpoint_list, next) {
        IOMMUMemoryRegion *region = virtio_iommu_mr(viommu, ep->id);

        assert(region);
        ep->domain = domain;
        ep->iommu_mr = region;
        g_tree_insert(viommu->endpoints, GUINT_TO_POINTER(ep->id), ep);
    }

    return false; /* continue the domain traversal */
}
/**
 * iommu_post_load - post_load hook of vmstate_virtio_iommu_device
 * @opaque: the VirtIOIOMMU whose state was just loaded
 * @version_id: version of the loaded state (unused)
 *
 * Walks the domain tree and lets reconstruct_endpoints() repopulate the
 * endpoint tree from each domain's endpoint list.
 *
 * Returns: always 0.
 */
static int iommu_post_load(void *opaque, int version_id)
{
    VirtIOIOMMU *viommu = opaque;

    g_tree_foreach(viommu->domains, reconstruct_endpoints, viommu);

    return 0;
}
/*
 * Device-specific migration state (installed as vdc->vmsd): the domain
 * tree, saved with the direct-key GTree helper. iommu_post_load() then
 * rebuilds the endpoint tree from the loaded domains.
 */
static const VMStateDescription vmstate_virtio_iommu_device = {
    .name = "virtio-iommu-device",
    .version_id = 1,
    .minimum_version_id = 1,
    .post_load = iommu_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 1,
                                   &vmstate_domain, VirtIOIOMMUDomain),
        VMSTATE_END_OF_LIST()
    },
};
/*
 * Top-level migration description (installed as dc->vmsd): the generic
 * virtio device state, registered with migration priority MIG_PRI_IOMMU.
 */
static const VMStateDescription vmstate_virtio_iommu = {
    .name = "virtio-iommu",
    .version_id = 1,
    .minimum_version_id = 1,
    .priority = MIG_PRI_IOMMU,
    .fields = (VMStateField[]) {
        VMSTATE_VIRTIO_DEVICE,
        VMSTATE_END_OF_LIST()
    },
};
/*
 * qdev properties of the virtio-iommu: a single "primary-bus" link
 * property stored in VirtIOIOMMU.primary_bus.
 */
static Property virtio_iommu_properties[] = {
    DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus,
                     "PCI", PCIBus *),
    DEFINE_PROP_END_OF_LIST(),
};
/**
 * virtio_iommu_class_init - class initializer of TYPE_VIRTIO_IOMMU
 * @klass: the class being initialized
 * @data: class data (unused)
 *
 * Installs the qdev properties, the migration descriptions, the device
 * category and all virtio device callbacks.
 */
static void virtio_iommu_class_init(ObjectClass *klass, void *data)
{
    VirtioDeviceClass *virtio_class = VIRTIO_DEVICE_CLASS(klass);
    DeviceClass *device_class = DEVICE_CLASS(klass);

    /* Generic device level: properties, vmstate and category. */
    device_class_set_props(device_class, virtio_iommu_properties);
    device_class->vmsd = &vmstate_virtio_iommu;
    set_bit(DEVICE_CATEGORY_MISC, device_class->categories);

    /* Lifecycle callbacks. */
    virtio_class->realize = virtio_iommu_device_realize;
    virtio_class->unrealize = virtio_iommu_device_unrealize;
    virtio_class->reset = virtio_iommu_device_reset;
    virtio_class->set_status = virtio_iommu_set_status;

    /* Config space and feature negotiation callbacks. */
    virtio_class->get_config = virtio_iommu_get_config;
    virtio_class->set_config = virtio_iommu_set_config;
    virtio_class->get_features = virtio_iommu_get_features;

    /* Device-specific migration state. */
    virtio_class->vmsd = &vmstate_virtio_iommu_device;
}
/**
 * virtio_iommu_memory_region_class_init - class initializer of
 * TYPE_VIRTIO_IOMMU_MEMORY_REGION
 * @klass: the class being initialized
 * @data: class data (unused)
 *
 * Hooks the virtio-iommu implementations of translate, replay,
 * notify_flag_changed and iommu_set_page_size_mask into the IOMMU memory
 * region class.
 */
static void virtio_iommu_memory_region_class_init(ObjectClass *klass,
                                                  void *data)
{
    IOMMUMemoryRegionClass *iommu_mr_class = IOMMU_MEMORY_REGION_CLASS(klass);

    iommu_mr_class->translate = virtio_iommu_translate;
    iommu_mr_class->replay = virtio_iommu_replay;
    iommu_mr_class->notify_flag_changed = virtio_iommu_notify_flag_changed;
    iommu_mr_class->iommu_set_page_size_mask =
        virtio_iommu_set_page_size_mask;
}
/* QOM type description of the virtio-iommu device, a TYPE_VIRTIO_DEVICE. */
static const TypeInfo virtio_iommu_info = {
    .name = TYPE_VIRTIO_IOMMU,
    .parent = TYPE_VIRTIO_DEVICE,
    .instance_size = sizeof(VirtIOIOMMU),
    .class_init = virtio_iommu_class_init,
    .instance_init = virtio_iommu_instance_init,
};
/*
 * QOM type description of the virtio-iommu memory region, derived from
 * TYPE_IOMMU_MEMORY_REGION.
 */
static const TypeInfo virtio_iommu_memory_region_info = {
    .name = TYPE_VIRTIO_IOMMU_MEMORY_REGION,
    .parent = TYPE_IOMMU_MEMORY_REGION,
    .class_init = virtio_iommu_memory_region_class_init,
};
/*
 * Registers the two QOM types defined in this file: the virtio-iommu
 * device and its IOMMU memory region.
 */
static void virtio_register_types(void)
{
    type_register_static(&virtio_iommu_info);
    type_register_static(&virtio_iommu_memory_region_info);
}

type_init(virtio_register_types)