qemu/hw/virtio/vhost-user.c

513 lines
15 KiB
C
Raw Normal View History

/*
* vhost-user
*
* Copyright (c) 2013 Virtual Open Systems Sarl.
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
#include "sysemu/char.h"
#include "sysemu/kvm.h"
#include "qemu/error-report.h"
#include "qemu/sockets.h"
#include "exec/ram_addr.h"
#include "migration/migration.h"
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <linux/vhost.h>
/*
 * Upper bound on memory regions in one VHOST_USER_SET_MEM_TABLE message.
 * It sizes VhostUserMemory.regions[] and the fd arrays that accompany a
 * message, and is the value returned by vhost_user_memslots_limit().
 */
#define VHOST_MEMORY_MAX_NREGIONS 8
/*
 * Bit number, in the feature word returned by VHOST_USER_GET_FEATURES, that
 * vhost_user_init() tests before issuing any *_PROTOCOL_FEATURES request.
 */
#define VHOST_USER_F_PROTOCOL_FEATURES 30
/*
 * Mask applied to the VHOST_USER_GET_PROTOCOL_FEATURES reply before it is
 * stored and sent back; 0x3 covers bit 0 (MQ) and bit 1 (LOG_SHMFD) below.
 */
#define VHOST_USER_PROTOCOL_FEATURE_MASK 0x3ULL
/* Protocol feature bit numbers (positions within the protocol feature word) */
#define VHOST_USER_PROTOCOL_F_MQ 0
#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1
/*
 * Request codes stored in VhostUserMsg.request.
 *
 * Values 0..14 are also used as indices into ioctl_to_vhost_user_request[],
 * whose entries are listed in the same order, so the numeric values must not
 * be changed independently of that table. Values 15..18 have no entry in
 * that table. VHOST_USER_MAX is not a request: it is the table size and the
 * loop bound used by vhost_user_request_translate().
 */
typedef enum VhostUserRequest {
VHOST_USER_NONE = 0,
VHOST_USER_GET_FEATURES = 1,
VHOST_USER_SET_FEATURES = 2,
VHOST_USER_SET_OWNER = 3,
VHOST_USER_RESET_DEVICE = 4,
VHOST_USER_SET_MEM_TABLE = 5,
VHOST_USER_SET_LOG_BASE = 6,
VHOST_USER_SET_LOG_FD = 7,
VHOST_USER_SET_VRING_NUM = 8,
VHOST_USER_SET_VRING_ADDR = 9,
VHOST_USER_SET_VRING_BASE = 10,
VHOST_USER_GET_VRING_BASE = 11,
VHOST_USER_SET_VRING_KICK = 12,
VHOST_USER_SET_VRING_CALL = 13,
VHOST_USER_SET_VRING_ERR = 14,
VHOST_USER_GET_PROTOCOL_FEATURES = 15,
VHOST_USER_SET_PROTOCOL_FEATURES = 16,
VHOST_USER_GET_QUEUE_NUM = 17,
VHOST_USER_SET_VRING_ENABLE = 18,
VHOST_USER_MAX
} VhostUserRequest;
/*
 * One memory region entry of a VHOST_USER_SET_MEM_TABLE payload.
 * vhost_user_call() fills one entry per region of dev->mem for which
 * qemu_get_ram_fd() returned an fd greater than 0.
 */
typedef struct VhostUserMemoryRegion {
/* copied from vhost_memory_region.guest_phys_addr */
uint64_t guest_phys_addr;
/* copied from vhost_memory_region.memory_size */
uint64_t memory_size;
/* copied from vhost_memory_region.userspace_addr */
uint64_t userspace_addr;
/*
 * userspace_addr minus the pointer returned by
 * qemu_get_ram_block_host_ptr() for the region's ram_addr
 */
uint64_t mmap_offset;
} VhostUserMemoryRegion;
/*
 * Payload of VHOST_USER_SET_MEM_TABLE. Only the first nregions entries of
 * regions[] are counted in the message size computed by vhost_user_call();
 * the fd for each entry travels separately in the fd array handed to
 * vhost_user_write().
 */
typedef struct VhostUserMemory {
/* number of valid entries in regions[] */
uint32_t nregions;
/* counted in the message size; no other use in this file */
uint32_t padding;
VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
} VhostUserMemory;
/*
 * A vhost-user message: a fixed header (request, flags, size) followed by a
 * request-dependent payload. The struct is packed and is written to / read
 * from the chardev as raw bytes by vhost_user_write() / vhost_user_read().
 */
typedef struct VhostUserMsg {
/* one of the VHOST_USER_* request codes */
VhostUserRequest request;
/*
 * Bit layout of the flags field. Outgoing messages carry
 * VHOST_USER_VERSION only; vhost_user_read() accepts a message only if
 * its flags equal VHOST_USER_REPLY_MASK | VHOST_USER_VERSION.
 */
#define VHOST_USER_VERSION_MASK (0x3)
#define VHOST_USER_REPLY_MASK (0x1<<2)
uint32_t flags;
uint32_t size; /* the following payload size */
union {
/*
 * Layout of u64 for the SET_VRING_KICK/CALL/ERR requests: the low 8 bits
 * hold the vring index, and bit 8 is set when no fd is attached.
 */
#define VHOST_USER_VRING_IDX_MASK (0xff)
#define VHOST_USER_VRING_NOFD_MASK (0x1<<8)
/* feature words, queue count, log base, or vring index plus NOFD flag */
uint64_t u64;
/* SET_VRING_NUM / SET_VRING_BASE / GET_VRING_BASE / SET_VRING_ENABLE */
struct vhost_vring_state state;
/* SET_VRING_ADDR */
struct vhost_vring_addr addr;
/* SET_MEM_TABLE */
VhostUserMemory memory;
};
} QEMU_PACKED VhostUserMsg;
/*
 * Dummy instance; in the code visible here it is used only as a sizeof()
 * operand to obtain the sizes of individual VhostUserMsg members.
 */
static VhostUserMsg m __attribute__ ((unused));
/* Size in bytes of the fixed header: request + flags + size */
#define VHOST_USER_HDR_SIZE (sizeof(m.request) \
+ sizeof(m.flags) \
+ sizeof(m.size))
/* Largest payload that fits in a VhostUserMsg after the header */
#define VHOST_USER_PAYLOAD_SIZE (sizeof(m) - VHOST_USER_HDR_SIZE)
/* The version of the protocol we support */
#define VHOST_USER_VERSION (0x1)
/*
 * Report whether both kvm_enabled() and kvm_eventfds_enabled() hold.
 * vhost_user_call() only attaches a kick/call/err fd when this is true.
 */
static bool ioeventfd_enabled(void)
{
    if (!kvm_enabled()) {
        return false;
    }

    return kvm_eventfds_enabled();
}
static unsigned long int ioctl_to_vhost_user_request[VHOST_USER_MAX] = {
-1, /* VHOST_USER_NONE */
VHOST_GET_FEATURES, /* VHOST_USER_GET_FEATURES */
VHOST_SET_FEATURES, /* VHOST_USER_SET_FEATURES */
VHOST_SET_OWNER, /* VHOST_USER_SET_OWNER */
VHOST_RESET_DEVICE, /* VHOST_USER_RESET_DEVICE */
VHOST_SET_MEM_TABLE, /* VHOST_USER_SET_MEM_TABLE */
VHOST_SET_LOG_BASE, /* VHOST_USER_SET_LOG_BASE */
VHOST_SET_LOG_FD, /* VHOST_USER_SET_LOG_FD */
VHOST_SET_VRING_NUM, /* VHOST_USER_SET_VRING_NUM */
VHOST_SET_VRING_ADDR, /* VHOST_USER_SET_VRING_ADDR */
VHOST_SET_VRING_BASE, /* VHOST_USER_SET_VRING_BASE */
VHOST_GET_VRING_BASE, /* VHOST_USER_GET_VRING_BASE */
VHOST_SET_VRING_KICK, /* VHOST_USER_SET_VRING_KICK */
VHOST_SET_VRING_CALL, /* VHOST_USER_SET_VRING_CALL */
VHOST_SET_VRING_ERR /* VHOST_USER_SET_VRING_ERR */
};
/*
 * Translate a vhost ioctl number into the matching VhostUserRequest by
 * scanning ioctl_to_vhost_user_request[] from index 0 upwards.
 *
 * Returns the index of the first matching entry, or VHOST_USER_NONE when no
 * entry matches.
 */
static VhostUserRequest vhost_user_request_translate(unsigned long int request)
{
    VhostUserRequest candidate = 0;

    while (candidate < VHOST_USER_MAX) {
        if (ioctl_to_vhost_user_request[candidate] == request) {
            return candidate;
        }
        candidate++;
    }

    return VHOST_USER_NONE;
}
/*
 * Read one reply message from the backend into @msg.
 *
 * The fixed-size header is read first and validated: flags must be exactly
 * VHOST_USER_REPLY_MASK | VHOST_USER_VERSION and the announced payload size
 * must not exceed VHOST_USER_PAYLOAD_SIZE. Only then is the payload (if
 * any) read directly behind the header.
 *
 * @dev: vhost device; dev->opaque is the CharDriverState to read from.
 * @msg: destination buffer for header and payload.
 *
 * Returns 0 on success, -1 on a short read or a failed validation (an error
 * is reported in each case).
 */
static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
{
    CharDriverState *chr = dev->opaque;
    uint8_t *p = (uint8_t *) msg;
    int r, size = VHOST_USER_HDR_SIZE;

    r = qemu_chr_fe_read_all(chr, p, size);
    if (r != size) {
        error_report("Failed to read msg header. Read %d instead of %d.", r,
                     size);
        return -1;
    }

    /* validate received flags */
    if (msg->flags != (VHOST_USER_REPLY_MASK | VHOST_USER_VERSION)) {
        error_report("Failed to read msg header."
                     " Flags 0x%x instead of 0x%x.", msg->flags,
                     VHOST_USER_REPLY_MASK | VHOST_USER_VERSION);
        return -1;
    }

    /*
     * validate message size is sane; msg->size is unsigned, so print it
     * with %u to avoid reporting a huge bogus size as a negative number
     */
    if (msg->size > VHOST_USER_PAYLOAD_SIZE) {
        error_report("Failed to read msg header."
                     " Size %u exceeds the maximum %zu.", msg->size,
                     VHOST_USER_PAYLOAD_SIZE);
        return -1;
    }

    if (msg->size) {
        /* the payload lands right behind the header inside *msg */
        p += VHOST_USER_HDR_SIZE;
        size = msg->size;
        r = qemu_chr_fe_read_all(chr, p, size);
        if (r != size) {
            error_report("Failed to read msg payload."
                         " Read %d instead of %u.", r, msg->size);
            return -1;
        }
    }

    return 0;
}
/*
 * Send @msg (header plus msg->size payload bytes) to the backend.
 *
 * @dev:    vhost device; dev->opaque is the CharDriverState to write to.
 * @msg:    message to send; msg->size must already be set.
 * @fds:    file descriptors to hand to qemu_chr_fe_set_msgfds() before the
 *          write; ignored when @fd_num is 0.
 * @fd_num: number of entries in @fds.
 *
 * Returns 0 if every byte was written, -1 otherwise.
 */
static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg,
                            int *fds, int fd_num)
{
    CharDriverState *chr = dev->opaque;
    int total = VHOST_USER_HDR_SIZE + msg->size;

    if (fd_num != 0) {
        qemu_chr_fe_set_msgfds(chr, fds, fd_num);
    }

    if (qemu_chr_fe_write_all(chr, (const uint8_t *) msg, total) != total) {
        return -1;
    }

    return 0;
}
/*
 * vhost-user: add multiple queue support
 *
 * This patch is initially based on a patch from Nikolay Nikolaev.
 *
 * This patch adds vhost-user multiple queue support, by creating a nc and
 * vhost_net pair for each queue.
 *
 * QEMU exits if it finds that the backend can't support the number of
 * requested queues (given with the queues=# option). The max number is
 * queried by a new message, VHOST_USER_GET_QUEUE_NUM, which is sent only
 * when the protocol feature VHOST_USER_PROTOCOL_F_MQ is present.
 *
 * The max queue check is done at the vhost-user initiation stage. We
 * initiate one queue first, which, in the meantime, also gets the
 * max_queues the backend supports.
 *
 * In an older version, it was reported that some messages were sent more
 * times than necessary. Here we came to an agreement with Michael that we
 * could categorize vhost-user messages into 2 types: non-vring specific
 * messages, which should be sent only once, and vring specific messages,
 * which should be sent per queue.
 *
 * Here I introduced a helper function, vhost_user_one_time_request(),
 * which lists the following messages as non-vring specific messages:
 *
 *     VHOST_USER_SET_OWNER
 *     VHOST_USER_RESET_DEVICE
 *     VHOST_USER_SET_MEM_TABLE
 *     VHOST_USER_GET_QUEUE_NUM
 *
 * For the above messages, we simply ignore them when they are not sent the
 * first time.
 *
 * Signed-off-by: Nikolay Nikolaev <n.nikolaev@virtualopensystems.com>
 * Signed-off-by: Changchun Ouyang <changchun.ouyang@intel.com>
 * Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 * Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
 * Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
 * Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 * Reviewed-by: Jason Wang <jasowang@redhat.com>
 * Tested-by: Marcel Apfelbaum <marcel@redhat.com>
 *
 * 2015-09-23 12:20:00 +08:00
 */
/*
 * Tell whether @request is one of the non-vring specific requests
 * (SET_OWNER, RESET_DEVICE, SET_MEM_TABLE, GET_QUEUE_NUM) that
 * vhost_user_call() sends only for the device whose vq_index is 0.
 */
static bool vhost_user_one_time_request(VhostUserRequest request)
{
    return request == VHOST_USER_SET_OWNER ||
           request == VHOST_USER_RESET_DEVICE ||
           request == VHOST_USER_SET_MEM_TABLE ||
           request == VHOST_USER_GET_QUEUE_NUM;
}
static int vhost_user_call(struct vhost_dev *dev, unsigned long int request,
void *arg)
{
VhostUserMsg msg;
VhostUserRequest msg_request;
struct vhost_vring_file *file = 0;
int need_reply = 0;
int fds[VHOST_MEMORY_MAX_NREGIONS];
int i, fd;
size_t fd_num = 0;
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
/* only translate vhost ioctl requests */
if (request > VHOST_USER_MAX) {
msg_request = vhost_user_request_translate(request);
} else {
msg_request = request;
}
vhost-user: add multiple queue support This patch is initially based a patch from Nikolay Nikolaev. This patch adds vhost-user multiple queue support, by creating a nc and vhost_net pair for each queue. Qemu exits if find that the backend can't support the number of requested queues (by providing queues=# option). The max number is queried by a new message, VHOST_USER_GET_QUEUE_NUM, and is sent only when protocol feature VHOST_USER_PROTOCOL_F_MQ is present first. The max queue check is done at vhost-user initiation stage. We initiate one queue first, which, in the meantime, also gets the max_queues the backend supports. In older version, it was reported that some messages are sent more times than necessary. Here we came an agreement with Michael that we could categorize vhost user messages to 2 types: non-vring specific messages, which should be sent only once, and vring specific messages, which should be sent per queue. Here I introduced a helper function vhost_user_one_time_request(), which lists following messages as non-vring specific messages: VHOST_USER_SET_OWNER VHOST_USER_RESET_DEVICE VHOST_USER_SET_MEM_TABLE VHOST_USER_GET_QUEUE_NUM For above messages, we simply ignore them when they are not sent the first time. Signed-off-by: Nikolay Nikolaev <n.nikolaev@virtualopensystems.com> Signed-off-by: Changchun Ouyang <changchun.ouyang@intel.com> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com> Reviewed-by: Jason Wang <jasowang@redhat.com> Tested-by: Marcel Apfelbaum <marcel@redhat.com>
2015-09-23 12:20:00 +08:00
/*
* For non-vring specific requests, like VHOST_USER_SET_MEM_TABLE,
* we just need send it once in the first time. For later such
* request, we just ignore it.
*/
if (vhost_user_one_time_request(msg_request) && dev->vq_index != 0) {
return 0;
}
msg.request = msg_request;
msg.flags = VHOST_USER_VERSION;
msg.size = 0;
switch (msg_request) {
case VHOST_USER_GET_FEATURES:
case VHOST_USER_GET_PROTOCOL_FEATURES:
case VHOST_USER_GET_QUEUE_NUM:
need_reply = 1;
break;
case VHOST_USER_SET_FEATURES:
case VHOST_USER_SET_PROTOCOL_FEATURES:
msg.u64 = *((__u64 *) arg);
msg.size = sizeof(m.u64);
break;
case VHOST_USER_SET_OWNER:
case VHOST_USER_RESET_DEVICE:
break;
case VHOST_USER_SET_MEM_TABLE:
for (i = 0; i < dev->mem->nregions; ++i) {
struct vhost_memory_region *reg = dev->mem->regions + i;
ram_addr_t ram_addr;
assert((uintptr_t)reg->userspace_addr == reg->userspace_addr);
qemu_ram_addr_from_host((void *)(uintptr_t)reg->userspace_addr, &ram_addr);
fd = qemu_get_ram_fd(ram_addr);
if (fd > 0) {
msg.memory.regions[fd_num].userspace_addr = reg->userspace_addr;
msg.memory.regions[fd_num].memory_size = reg->memory_size;
msg.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr;
msg.memory.regions[fd_num].mmap_offset = reg->userspace_addr -
(uintptr_t) qemu_get_ram_block_host_ptr(ram_addr);
assert(fd_num < VHOST_MEMORY_MAX_NREGIONS);
fds[fd_num++] = fd;
}
}
msg.memory.nregions = fd_num;
if (!fd_num) {
error_report("Failed initializing vhost-user memory map, "
"consider using -object memory-backend-file share=on");
return -1;
}
msg.size = sizeof(m.memory.nregions);
msg.size += sizeof(m.memory.padding);
msg.size += fd_num * sizeof(VhostUserMemoryRegion);
break;
case VHOST_USER_SET_LOG_FD:
fds[fd_num++] = *((int *) arg);
break;
case VHOST_USER_SET_VRING_NUM:
case VHOST_USER_SET_VRING_BASE:
case VHOST_USER_SET_VRING_ENABLE:
memcpy(&msg.state, arg, sizeof(struct vhost_vring_state));
msg.size = sizeof(m.state);
break;
case VHOST_USER_GET_VRING_BASE:
memcpy(&msg.state, arg, sizeof(struct vhost_vring_state));
msg.size = sizeof(m.state);
need_reply = 1;
break;
case VHOST_USER_SET_VRING_ADDR:
memcpy(&msg.addr, arg, sizeof(struct vhost_vring_addr));
msg.size = sizeof(m.addr);
break;
case VHOST_USER_SET_VRING_KICK:
case VHOST_USER_SET_VRING_CALL:
case VHOST_USER_SET_VRING_ERR:
file = arg;
msg.u64 = file->index & VHOST_USER_VRING_IDX_MASK;
msg.size = sizeof(m.u64);
if (ioeventfd_enabled() && file->fd > 0) {
fds[fd_num++] = file->fd;
} else {
msg.u64 |= VHOST_USER_VRING_NOFD_MASK;
}
break;
default:
error_report("vhost-user trying to send unhandled ioctl");
return -1;
break;
}
if (vhost_user_write(dev, &msg, fds, fd_num) < 0) {
return 0;
}
if (need_reply) {
if (vhost_user_read(dev, &msg) < 0) {
return 0;
}
if (msg_request != msg.request) {
error_report("Received unexpected msg type."
" Expected %d received %d", msg_request, msg.request);
return -1;
}
switch (msg_request) {
case VHOST_USER_GET_FEATURES:
case VHOST_USER_GET_PROTOCOL_FEATURES:
case VHOST_USER_GET_QUEUE_NUM:
if (msg.size != sizeof(m.u64)) {
error_report("Received bad msg size.");
return -1;
}
*((__u64 *) arg) = msg.u64;
break;
case VHOST_USER_GET_VRING_BASE:
if (msg.size != sizeof(m.state)) {
error_report("Received bad msg size.");
return -1;
}
memcpy(arg, &msg.state, sizeof(struct vhost_vring_state));
break;
default:
error_report("Received unexpected msg type.");
return -1;
break;
}
}
return 0;
}
/*
 * Send VHOST_USER_SET_LOG_BASE with @base as the 64-bit payload.
 *
 * When the VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol feature is set, the log
 * fd (if log->fd != -1) is attached to the message, and a reply is then
 * read and checked to carry the same request code.
 *
 * @dev:  vhost device.
 * @base: log base value placed in the payload.
 * @log:  log object; only log->fd is read here.
 *
 * Returns -1 if the reply has an unexpected request code, 0 otherwise.
 * As in vhost_user_call(), a failure to write the request or to read the
 * reply is treated as best effort and reported as 0.
 * NOTE(review): callers are not visible here; confirm before turning these
 * I/O failures into errors.
 */
static int vhost_set_log_base(struct vhost_dev *dev, uint64_t base,
                              struct vhost_log *log)
{
    int fds[VHOST_MEMORY_MAX_NREGIONS];
    size_t fd_num = 0;
    bool shmfd = virtio_has_feature(dev->protocol_features,
                                    VHOST_USER_PROTOCOL_F_LOG_SHMFD);
    VhostUserMsg msg = {
        .request = VHOST_USER_SET_LOG_BASE,
        .flags = VHOST_USER_VERSION,
        .u64 = base,
        .size = sizeof(m.u64),
    };

    if (shmfd && log->fd != -1) {
        fds[fd_num++] = log->fd;
    }

    /*
     * If the request never went out there is no reply to wait for, so stop
     * here instead of going on to read from the backend.
     */
    if (vhost_user_write(dev, &msg, fds, fd_num) < 0) {
        return 0;
    }

    if (shmfd) {
        if (vhost_user_read(dev, &msg) < 0) {
            return 0;
        }

        if (msg.request != VHOST_USER_SET_LOG_BASE) {
            error_report("Received unexpected msg type. "
                         "Expected %d received %d",
                         VHOST_USER_SET_LOG_BASE, msg.request);
            return -1;
        }
    }

    return 0;
}
/*
 * Backend init hook: remember the chardev and negotiate features.
 *
 * Steps:
 *   1. store @opaque in dev->opaque (used by every later read/write);
 *   2. query the backend features with VHOST_USER_GET_FEATURES;
 *   3. if bit VHOST_USER_F_PROTOCOL_FEATURES is offered, record it in
 *      dev->backend_features, fetch the protocol features, mask them with
 *      VHOST_USER_PROTOCOL_FEATURE_MASK and send the result back;
 *   4. if the MQ protocol feature was kept, query dev->max_queues;
 *   5. if LOG_SHMFD was not kept and no migration blocker is set yet,
 *      install one.
 *
 * Returns 0 on success, or the negative value returned by a failing
 * vhost_user_call().
 */
static int vhost_user_init(struct vhost_dev *dev, void *opaque)
{
    /*
     * Must start at 0: vhost_user_call() returns 0 without writing to its
     * output argument when the chardev write or read fails, and the value
     * is inspected right after the call.
     */
    unsigned long long features = 0;
    int err;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);

    dev->opaque = opaque;

    err = vhost_user_call(dev, VHOST_USER_GET_FEATURES, &features);
    if (err < 0) {
        return err;
    }

    if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) {
        dev->backend_features |= 1ULL << VHOST_USER_F_PROTOCOL_FEATURES;

        /* reset so a silently failed query cannot reuse the feature word */
        features = 0;
        err = vhost_user_call(dev, VHOST_USER_GET_PROTOCOL_FEATURES, &features);
        if (err < 0) {
            return err;
        }

        /* keep only the protocol features this file knows about */
        dev->protocol_features = features & VHOST_USER_PROTOCOL_FEATURE_MASK;
        err = vhost_user_call(dev, VHOST_USER_SET_PROTOCOL_FEATURES,
                              &dev->protocol_features);
        if (err < 0) {
            return err;
        }

        /* query the max queues we support if backend supports Multiple Queue */
        if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) {
            err = vhost_user_call(dev, VHOST_USER_GET_QUEUE_NUM, &dev->max_queues);
            if (err < 0) {
                return err;
            }
        }
    }

    if (dev->migration_blocker == NULL &&
        !virtio_has_feature(dev->protocol_features,
                            VHOST_USER_PROTOCOL_F_LOG_SHMFD)) {
        error_setg(&dev->migration_blocker,
                   "Migration disabled: vhost-user backend lacks "
                   "VHOST_USER_PROTOCOL_F_LOG_SHMFD feature.");
    }

    return 0;
}
/*
 * Send VHOST_USER_SET_VRING_ENABLE for the vring at index dev->vq_index,
 * with @enable as the state value.
 *
 * Returns -1 without sending anything when the MQ protocol feature bit is
 * not set in dev->protocol_features; otherwise returns the result of
 * vhost_user_call().
 */
static int vhost_user_set_vring_enable(struct vhost_dev *dev, int enable)
{
    struct vhost_vring_state state = {
        .index = dev->vq_index,
        .num = enable,
    };

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);

    if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) == 0) {
        return -1;
    }

    return vhost_user_call(dev, VHOST_USER_SET_VRING_ENABLE, &state);
}
/*
 * Backend cleanup hook: drop the chardev reference kept in dev->opaque.
 * Nothing is closed or freed here. Always returns 0.
 */
static int vhost_user_cleanup(struct vhost_dev *dev)
{
    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);

    dev->opaque = NULL;

    return 0;
}
/*
 * Return the virtqueue index to use toward the backend for @idx.
 *
 * For vhost-user the index is passed through unchanged; the function only
 * asserts that @idx lies in [dev->vq_index, dev->vq_index + dev->nvqs).
 */
static int vhost_user_get_vq_index(struct vhost_dev *dev, int idx)
{
assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
return idx;
}
/*
 * Report how many memory slots this backend can describe, which is the
 * capacity of the region array in a SET_MEM_TABLE message. @dev is unused.
 */
static int vhost_user_memslots_limit(struct vhost_dev *dev)
{
    const int max_slots = VHOST_MEMORY_MAX_NREGIONS;

    return max_slots;
}
/*
 * Tell whether the VHOST_USER_PROTOCOL_F_LOG_SHMFD bit is set in
 * dev->protocol_features.
 */
static bool vhost_user_requires_shm_log(struct vhost_dev *dev)
{
    bool has_log_shmfd;

    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);

    has_log_shmfd = virtio_has_feature(dev->protocol_features,
                                       VHOST_USER_PROTOCOL_F_LOG_SHMFD);
    return has_log_shmfd;
}
/*
 * Operations table for the vhost-user backend, wiring each VhostOps hook
 * to the corresponding function in this file.
 */
const VhostOps user_ops = {
    .backend_type = VHOST_BACKEND_TYPE_USER,

    /* lifecycle */
    .vhost_backend_init = vhost_user_init,
    .vhost_backend_cleanup = vhost_user_cleanup,

    /* request path */
    .vhost_call = vhost_user_call,
    .vhost_set_log_base = vhost_set_log_base,
    .vhost_backend_set_vring_enable = vhost_user_set_vring_enable,

    /* queries */
    .vhost_backend_get_vq_index = vhost_user_get_vq_index,
    .vhost_backend_memslots_limit = vhost_user_memslots_limit,
    .vhost_requires_shm_log = vhost_user_requires_shm_log,
};