From 751522275a800c0d1c215b386a17f537bc76cd37 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 5 Aug 2018 18:35:06 +0300 Subject: [PATCH 01/15] hw/rdma: Make distinction between device init and start modes There are certain operations that are well considered as part of device configuration while others are needed only when "start" command is triggered by the guest driver. An example of device initialization step is msix_init and example of "device start" stage is the creation of a CQ completion handler thread. Driver expects such distinction - implement it. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum Message-Id: <20180805153518.2983-2-yuval.shaia@oracle.com> Signed-off-by: Marcel Apfelbaum --- hw/rdma/rdma_backend.c | 100 ++++++++++++++++++++++------ hw/rdma/rdma_backend.h | 2 + hw/rdma/rdma_backend_defs.h | 3 +- hw/rdma/vmw/pvrdma_main.c | 129 +++++++++++++++++++++--------------- 4 files changed, 157 insertions(+), 77 deletions(-) diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c index e9ced6f9ef..647e16399f 100644 --- a/hw/rdma/rdma_backend.c +++ b/hw/rdma/rdma_backend.c @@ -35,6 +35,7 @@ #define VENDOR_ERR_MR_SMALL 0x208 #define THR_NAME_LEN 16 +#define THR_POLL_TO 5000 typedef struct BackendCtx { uint64_t req_id; @@ -91,35 +92,82 @@ static void *comp_handler_thread(void *arg) int rc; struct ibv_cq *ev_cq; void *ev_ctx; + int flags; + GPollFD pfds[1]; + + /* Change to non-blocking mode */ + flags = fcntl(backend_dev->channel->fd, F_GETFL); + rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK); + if (rc < 0) { + pr_dbg("Fail to change to non-blocking mode\n"); + return NULL; + } pr_dbg("Starting\n"); + pfds[0].fd = backend_dev->channel->fd; + pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR; + + backend_dev->comp_thread.is_running = true; + while (backend_dev->comp_thread.run) { - pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel); - rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx); - pr_dbg("ibv_get_cq_event=%d\n", rc); - if (unlikely(rc)) { - pr_dbg("---> ibv_get_cq_event (%d)\n", rc); - continue; + do { + rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS); + } while (!rc && backend_dev->comp_thread.run); + + if (backend_dev->comp_thread.run) { + pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel); + rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx); + pr_dbg("ibv_get_cq_event=%d\n", rc); + if (unlikely(rc)) { + pr_dbg("---> ibv_get_cq_event (%d)\n", rc); + continue; + } + + rc = ibv_req_notify_cq(ev_cq, 0); + if (unlikely(rc)) { + pr_dbg("Error %d from ibv_req_notify_cq\n", rc); + } + + poll_cq(backend_dev->rdma_dev_res, ev_cq); + + ibv_ack_cq_events(ev_cq, 1); } - - rc = ibv_req_notify_cq(ev_cq, 0); - if (unlikely(rc)) { - pr_dbg("Error %d from ibv_req_notify_cq\n", rc); - } - - poll_cq(backend_dev->rdma_dev_res, ev_cq); - - ibv_ack_cq_events(ev_cq, 1); } pr_dbg("Going down\n"); /* TODO: Post cqe for all remaining buffs that were posted */ + backend_dev->comp_thread.is_running = false; + + qemu_thread_exit(0); + return NULL; } +static void stop_comp_thread(RdmaBackendDev *backend_dev) +{ + backend_dev->comp_thread.run = false; + while (backend_dev->comp_thread.is_running) { + pr_dbg("Waiting for thread to complete\n"); + sleep(THR_POLL_TO / SCALE_US / 2); + } +} + +static void start_comp_thread(RdmaBackendDev *backend_dev) +{ + char thread_name[THR_NAME_LEN] = {0}; + + stop_comp_thread(backend_dev); + + snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s", + ibv_get_device_name(backend_dev->ib_dev)); + backend_dev->comp_thread.run = true; + qemu_thread_create(&backend_dev->comp_thread.thread, thread_name, + comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED); +} + void rdma_backend_register_comp_handler(void (*handler)(int status, unsigned int vendor_err, void *ctx)) { @@ -706,7 +754,6 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, int i; int ret = 0; int num_ibv_devices; - char thread_name[THR_NAME_LEN] = {0}; struct ibv_device **dev_list; struct ibv_port_attr port_attr; @@ -800,11 +847,8 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, pr_dbg("interface_id=0x%" PRIx64 "\n", be64_to_cpu(backend_dev->gid.global.interface_id)); - snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s", - ibv_get_device_name(backend_dev->ib_dev)); - backend_dev->comp_thread.run = true; - qemu_thread_create(&backend_dev->comp_thread.thread, thread_name, - comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED); + backend_dev->comp_thread.run = false; + backend_dev->comp_thread.is_running = false; ah_cache_init(); @@ -823,8 +867,22 @@ out: return ret; } + +void rdma_backend_start(RdmaBackendDev *backend_dev) +{ + pr_dbg("Starting rdma_backend\n"); + start_comp_thread(backend_dev); +} + +void rdma_backend_stop(RdmaBackendDev *backend_dev) +{ + pr_dbg("Stopping rdma_backend\n"); + stop_comp_thread(backend_dev); +} + void rdma_backend_fini(RdmaBackendDev *backend_dev) { + rdma_backend_stop(backend_dev); g_hash_table_destroy(ah_hash); ibv_destroy_comp_channel(backend_dev->channel); ibv_close_device(backend_dev->context); diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h index 3cd636dd88..3049a73962 100644 --- a/hw/rdma/rdma_backend.h +++ b/hw/rdma/rdma_backend.h @@ -52,6 +52,8 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr, Error **errp); void rdma_backend_fini(RdmaBackendDev *backend_dev); +void rdma_backend_start(RdmaBackendDev *backend_dev); +void rdma_backend_stop(RdmaBackendDev *backend_dev); void rdma_backend_register_comp_handler(void (*handler)(int status, unsigned int vendor_err, void *ctx)); void rdma_backend_unregister_comp_handler(void); diff --git a/hw/rdma/rdma_backend_defs.h b/hw/rdma/rdma_backend_defs.h index ff5cfc26eb..7404f64002 100644 --- a/hw/rdma/rdma_backend_defs.h +++ b/hw/rdma/rdma_backend_defs.h @@ -24,7 +24,8 @@ typedef struct RdmaDeviceResources RdmaDeviceResources; typedef struct RdmaBackendThread { QemuThread thread; QemuMutex mutex; - bool run; + bool run; /* Set by thread manager to let thread know it should exit */ + bool is_running; /* Set by the thread to report its status */ } RdmaBackendThread; typedef struct RdmaBackendDev { diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c index 3ed7409763..6a5073974d 100644 --- a/hw/rdma/vmw/pvrdma_main.c +++ b/hw/rdma/vmw/pvrdma_main.c @@ -286,8 +286,78 @@ static void init_ports(PVRDMADev *dev, Error **errp) } } +static void uninit_msix(PCIDevice *pdev, int used_vectors) +{ + PVRDMADev *dev = PVRDMA_DEV(pdev); + int i; + + for (i = 0; i < used_vectors; i++) { + msix_vector_unuse(pdev, i); + } + + msix_uninit(pdev, &dev->msix, &dev->msix); +} + +static int init_msix(PCIDevice *pdev, Error **errp) +{ + PVRDMADev *dev = PVRDMA_DEV(pdev); + int i; + int rc; + + rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX, + RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX, + RDMA_MSIX_PBA, 0, NULL); + + if (rc < 0) { + error_setg(errp, "Failed to initialize MSI-X"); + return rc; + } + + for (i = 0; i < RDMA_MAX_INTRS; i++) { + rc = msix_vector_use(PCI_DEVICE(dev), i); + if (rc < 0) { + error_setg(errp, "Fail mark MSI-X vector %d", i); + uninit_msix(pdev, i); + return rc; + } + } + + return 0; +} + +static void pvrdma_fini(PCIDevice *pdev) +{ + PVRDMADev *dev = PVRDMA_DEV(pdev); + + pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn)); + + pvrdma_qp_ops_fini(); + + rdma_rm_fini(&dev->rdma_dev_res); + + rdma_backend_fini(&dev->backend_dev); + + free_dsr(dev); + + if (msix_enabled(pdev)) { + uninit_msix(pdev, RDMA_MAX_INTRS); + } +} + +static void pvrdma_stop(PVRDMADev *dev) +{ + rdma_backend_stop(&dev->backend_dev); +} + +static void pvrdma_start(PVRDMADev *dev) +{ + rdma_backend_start(&dev->backend_dev); +} + static void activate_device(PVRDMADev *dev) { + pvrdma_start(dev); set_reg_val(dev, PVRDMA_REG_ERR, 0); pr_dbg("Device activated\n"); } @@ -300,7 +370,10 @@ static int unquiesce_device(PVRDMADev *dev) static int reset_device(PVRDMADev *dev) { + pvrdma_stop(dev); + pr_dbg("Device reset complete\n"); + return 0; } @@ -469,45 +542,6 @@ static void init_regs(PCIDevice *pdev) set_reg_val(dev, PVRDMA_REG_ERR, 0xFFFF); } -static void uninit_msix(PCIDevice *pdev, int used_vectors) -{ - PVRDMADev *dev = PVRDMA_DEV(pdev); - int i; - - for (i = 0; i < used_vectors; i++) { - msix_vector_unuse(pdev, i); - } - - msix_uninit(pdev, &dev->msix, &dev->msix); -} - -static int init_msix(PCIDevice *pdev, Error **errp) -{ - PVRDMADev *dev = PVRDMA_DEV(pdev); - int i; - int rc; - - rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX, - RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX, - RDMA_MSIX_PBA, 0, NULL); - - if (rc < 0) { - error_setg(errp, "Failed to initialize MSI-X"); - return rc; - } - - for (i = 0; i < RDMA_MAX_INTRS; i++) { - rc = msix_vector_use(PCI_DEVICE(dev), i); - if (rc < 0) { - error_setg(errp, "Fail mark MSI-X vercor %d", i); - uninit_msix(pdev, i); - return rc; - } - } - - return 0; -} - static void init_dev_caps(PVRDMADev *dev) { size_t pg_tbl_bytes = TARGET_PAGE_SIZE * @@ -602,22 +636,7 @@ out: static void pvrdma_exit(PCIDevice *pdev) { - PVRDMADev *dev = PVRDMA_DEV(pdev); - - pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn), - PCI_FUNC(pdev->devfn)); - - pvrdma_qp_ops_fini(); - - rdma_rm_fini(&dev->rdma_dev_res); - - rdma_backend_fini(&dev->backend_dev); - - free_dsr(dev); - - if (msix_enabled(pdev)) { - uninit_msix(pdev, RDMA_MAX_INTRS); - } + pvrdma_fini(pdev); } static void pvrdma_class_init(ObjectClass *klass, void *data) From b462a1348031e4731b25abd615529f56cc524bc1 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 5 Aug 2018 18:35:07 +0300 Subject: [PATCH 02/15] hw/pvrdma: Bugfix - provide the correct attr_mask to query_qp Calling rdma_rm_query_qp with attr_mask equals to -1 leads to error where backend query_qp fails to retrieve the needed QP attributes. Fix it by providing the attr_mask we got from driver. Signed-off-by: Yuval Shaia Message-Id: <20180805153518.2983-3-yuval.shaia@oracle.com> Reviewed-by: Marcel Apfelbaum Signed-off-by: Marcel Apfelbaum --- hw/rdma/vmw/pvrdma_cmd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c index 14255d609f..e7d6589cdc 100644 --- a/hw/rdma/vmw/pvrdma_cmd.c +++ b/hw/rdma/vmw/pvrdma_cmd.c @@ -524,6 +524,7 @@ static int query_qp(PVRDMADev *dev, union pvrdma_cmd_req *req, struct ibv_qp_init_attr init_attr; pr_dbg("qp_handle=%d\n", cmd->qp_handle); + pr_dbg("attr_mask=0x%x\n", cmd->attr_mask); memset(rsp, 0, sizeof(*rsp)); rsp->hdr.response = cmd->hdr.response; @@ -531,8 +532,8 @@ static int query_qp(PVRDMADev *dev, union pvrdma_cmd_req *req, rsp->hdr.err = rdma_rm_query_qp(&dev->rdma_dev_res, &dev->backend_dev, cmd->qp_handle, - (struct ibv_qp_attr *)&resp->attrs, -1, - &init_attr); + (struct ibv_qp_attr *)&resp->attrs, + cmd->attr_mask, &init_attr); pr_dbg("ret=%d\n", rsp->hdr.err); return rsp->hdr.err; From ef846e026c30c39239500d0b91b26d4fecc1f1d0 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 5 Aug 2018 18:35:08 +0300 Subject: [PATCH 03/15] hw/rdma: Modify debug macros - Add line counter to ease navigation in log - Print rdma instead of pvrdma Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum Message-Id: <20180805153518.2983-4-yuval.shaia@oracle.com> Signed-off-by: Marcel Apfelbaum --- hw/rdma/rdma_utils.c | 4 ++++ hw/rdma/rdma_utils.h | 16 ++++++++++++---- hw/rdma/vmw/pvrdma_main.c | 2 ++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/hw/rdma/rdma_utils.c b/hw/rdma/rdma_utils.c index d713f635f1..dc23f158f3 100644 --- a/hw/rdma/rdma_utils.c +++ b/hw/rdma/rdma_utils.c @@ -15,6 +15,10 @@ #include "rdma_utils.h" +#ifdef PVRDMA_DEBUG +unsigned long pr_dbg_cnt; +#endif + void *rdma_pci_dma_map(PCIDevice *dev, dma_addr_t addr, dma_addr_t plen) { void *p; diff --git a/hw/rdma/rdma_utils.h b/hw/rdma/rdma_utils.h index 3dc07891bc..04c7c2ef5b 100644 --- a/hw/rdma/rdma_utils.h +++ b/hw/rdma/rdma_utils.h @@ -22,18 +22,26 @@ #include "sysemu/dma.h" #define pr_info(fmt, ...) \ - fprintf(stdout, "%s: %-20s (%3d): " fmt, "pvrdma", __func__, __LINE__,\ + fprintf(stdout, "%s: %-20s (%3d): " fmt, "rdma", __func__, __LINE__,\ ## __VA_ARGS__) #define pr_err(fmt, ...) \ - fprintf(stderr, "%s: Error at %-20s (%3d): " fmt, "pvrdma", __func__, \ + fprintf(stderr, "%s: Error at %-20s (%3d): " fmt, "rdma", __func__, \ __LINE__, ## __VA_ARGS__) #ifdef PVRDMA_DEBUG +extern unsigned long pr_dbg_cnt; + +#define init_pr_dbg(void) \ +{ \ + pr_dbg_cnt = 0; \ +} + #define pr_dbg(fmt, ...) \ - fprintf(stdout, "%s: %-20s (%3d): " fmt, "pvrdma", __func__, __LINE__,\ - ## __VA_ARGS__) + fprintf(stdout, "%lx %ld: %-20s (%3d): " fmt, pthread_self(), pr_dbg_cnt++, \ + __func__, __LINE__, ## __VA_ARGS__) #else +#define init_pr_dbg(void) #define pr_dbg(fmt, ...) #endif diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c index 6a5073974d..1b1330e113 100644 --- a/hw/rdma/vmw/pvrdma_main.c +++ b/hw/rdma/vmw/pvrdma_main.c @@ -577,6 +577,8 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp) Object *memdev_root; bool ram_shared = false; + init_pr_dbg(); + pr_dbg("Initializing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); From eca0f2a6be86602a82edcd90ecd8f4a57b0c8007 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 5 Aug 2018 18:35:09 +0300 Subject: [PATCH 04/15] hw/pvrdma: Clean CQE before use Next CQE is fetched from CQ ring, clean it before usage as it still carries old CQE values. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum Message-Id: <20180805153518.2983-5-yuval.shaia@oracle.com> Signed-off-by: Marcel Apfelbaum --- hw/rdma/vmw/pvrdma_qp_ops.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c index 99bb51111e..a8664f40c8 100644 --- a/hw/rdma/vmw/pvrdma_qp_ops.c +++ b/hw/rdma/vmw/pvrdma_qp_ops.c @@ -69,6 +69,7 @@ static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t cq_handle, return -EINVAL; } + memset(cqe1, 0, sizeof(*cqe1)); cqe1->wr_id = cqe->wr_id; cqe1->qp = cqe->qp; cqe1->opcode = cqe->opcode; From 6e7dba23af1a8f30eac42022cfb94d8693ed20e2 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 5 Aug 2018 18:35:10 +0300 Subject: [PATCH 05/15] hw/pvrdma: Make default pkey 0xFFFF 0x7FFF is not the default pkey - fix it. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum Message-Id: <20180805153518.2983-6-yuval.shaia@oracle.com> Signed-off-by: Marcel Apfelbaum --- hw/rdma/vmw/pvrdma.h | 3 +++ hw/rdma/vmw/pvrdma_cmd.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h index 81e0e0e99c..e2d9f93cdf 100644 --- a/hw/rdma/vmw/pvrdma.h +++ b/hw/rdma/vmw/pvrdma.h @@ -50,6 +50,9 @@ #define PVRDMA_HW_VERSION 17 #define PVRDMA_FW_VERSION 14 +/* Some defaults */ +#define PVRDMA_PKEY 0x7FFF + typedef struct DSRInfo { dma_addr_t dma; struct pvrdma_device_shared_region *dsr; diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c index e7d6589cdc..4f74b29936 100644 --- a/hw/rdma/vmw/pvrdma_cmd.c +++ b/hw/rdma/vmw/pvrdma_cmd.c @@ -166,7 +166,7 @@ static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req *req, resp->hdr.ack = PVRDMA_CMD_QUERY_PKEY_RESP; resp->hdr.err = 0; - resp->pkey = 0x7FFF; + resp->pkey = PVRDMA_PKEY; pr_dbg("pkey=0x%x\n", resp->pkey); return 0; From 7f99daadbddfee16f3ee77c413bd4a34366a3dad Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 5 Aug 2018 18:35:11 +0300 Subject: [PATCH 06/15] hw/rdma: Delete useless structure RdmaRmUserMR The structure RdmaRmUserMR has no benefits, remove it an move all its fields to struct RdmaRmMR. Reviewed-by: Marcel Apfelbaum Signed-off-by: Yuval Shaia Message-Id: <20180805153518.2983-7-yuval.shaia@oracle.com> Signed-off-by: Marcel Apfelbaum --- hw/rdma/rdma_backend.c | 3 +-- hw/rdma/rdma_rm.c | 16 ++++++++-------- hw/rdma/rdma_rm_defs.h | 10 +++------- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c index 647e16399f..52981d652d 100644 --- a/hw/rdma/rdma_backend.c +++ b/hw/rdma/rdma_backend.c @@ -271,8 +271,7 @@ static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res, return VENDOR_ERR_INVLKEY | ssge[ssge_idx].lkey; } - dsge->addr = (uintptr_t)mr->user_mr.host_virt + ssge[ssge_idx].addr - - mr->user_mr.guest_start; + dsge->addr = (uintptr_t)mr->virt + ssge[ssge_idx].addr - mr->start; dsge->length = ssge[ssge_idx].length; dsge->lkey = rdma_backend_mr_lkey(&mr->backend_mr); diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c index 415da15efe..7403d24674 100644 --- a/hw/rdma/rdma_rm.c +++ b/hw/rdma/rdma_rm.c @@ -165,15 +165,15 @@ int rdma_rm_alloc_mr(RdmaDeviceResources *dev_res, uint32_t pd_handle, length = TARGET_PAGE_SIZE; addr = g_malloc(length); } else { - mr->user_mr.host_virt = host_virt; - pr_dbg("host_virt=0x%p\n", mr->user_mr.host_virt); - mr->user_mr.length = guest_length; + mr->virt = host_virt; + pr_dbg("host_virt=0x%p\n", mr->virt); + mr->length = guest_length; pr_dbg("length=%zu\n", guest_length); - mr->user_mr.guest_start = guest_start; - pr_dbg("guest_start=0x%" PRIx64 "\n", mr->user_mr.guest_start); + mr->start = guest_start; + pr_dbg("guest_start=0x%" PRIx64 "\n", mr->start); - length = mr->user_mr.length; - addr = mr->user_mr.host_virt; + length = mr->length; + addr = mr->virt; } ret = rdma_backend_create_mr(&mr->backend_mr, &pd->backend_pd, addr, length, @@ -214,7 +214,7 @@ void rdma_rm_dealloc_mr(RdmaDeviceResources *dev_res, uint32_t mr_handle) if (mr) { rdma_backend_destroy_mr(&mr->backend_mr); - munmap(mr->user_mr.host_virt, mr->user_mr.length); + munmap(mr->virt, mr->length); res_tbl_dealloc(&dev_res->mr_tbl, mr_handle); } } diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h index 226011176d..7228151239 100644 --- a/hw/rdma/rdma_rm_defs.h +++ b/hw/rdma/rdma_rm_defs.h @@ -55,16 +55,12 @@ typedef struct RdmaRmCQ { bool notify; } RdmaRmCQ; -typedef struct RdmaRmUserMR { - void *host_virt; - uint64_t guest_start; - size_t length; -} RdmaRmUserMR; - /* MR (DMA region) */ typedef struct RdmaRmMR { RdmaBackendMR backend_mr; - RdmaRmUserMR user_mr; + void *virt; + uint64_t start; + size_t length; uint32_t pd_handle; uint32_t lkey; uint32_t rkey; From 4c408c9d5b3f648a2f100edf6287a7c7ce63f634 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 5 Aug 2018 18:35:12 +0300 Subject: [PATCH 07/15] hw/rdma: Do not allocate memory for non-dma MR There is no use in the memory allocated for non-dma MR. Delete the code that allocates it. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum Message-Id: <20180805153518.2983-8-yuval.shaia@oracle.com> Signed-off-by: Marcel Apfelbaum --- hw/rdma/rdma_rm.c | 52 +++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c index 7403d24674..bf4a5c71b4 100644 --- a/hw/rdma/rdma_rm.c +++ b/hw/rdma/rdma_rm.c @@ -144,8 +144,6 @@ int rdma_rm_alloc_mr(RdmaDeviceResources *dev_res, uint32_t pd_handle, RdmaRmMR *mr; int ret = 0; RdmaRmPD *pd; - void *addr; - size_t length; pd = rdma_rm_get_pd(dev_res, pd_handle); if (!pd) { @@ -158,40 +156,29 @@ int rdma_rm_alloc_mr(RdmaDeviceResources *dev_res, uint32_t pd_handle, pr_dbg("Failed to allocate obj in table\n"); return -ENOMEM; } + pr_dbg("mr_handle=%d\n", *mr_handle); - if (!host_virt) { - /* TODO: This is my guess but not so sure that this needs to be - * done */ - length = TARGET_PAGE_SIZE; - addr = g_malloc(length); - } else { + pr_dbg("host_virt=0x%p\n", host_virt); + pr_dbg("guest_start=0x%" PRIx64 "\n", guest_start); + pr_dbg("length=%zu\n", guest_length); + + if (host_virt) { mr->virt = host_virt; - pr_dbg("host_virt=0x%p\n", mr->virt); - mr->length = guest_length; - pr_dbg("length=%zu\n", guest_length); mr->start = guest_start; - pr_dbg("guest_start=0x%" PRIx64 "\n", mr->start); + mr->length = guest_length; - length = mr->length; - addr = mr->virt; + ret = rdma_backend_create_mr(&mr->backend_mr, &pd->backend_pd, mr->virt, + mr->length, access_flags); + if (ret) { + pr_dbg("Fail in rdma_backend_create_mr, err=%d\n", ret); + ret = -EIO; + goto out_dealloc_mr; + } } - ret = rdma_backend_create_mr(&mr->backend_mr, &pd->backend_pd, addr, length, - access_flags); - if (ret) { - pr_dbg("Fail in rdma_backend_create_mr, err=%d\n", ret); - ret = -EIO; - goto out_dealloc_mr; - } - - if (!host_virt) { - *lkey = mr->lkey = rdma_backend_mr_lkey(&mr->backend_mr); - *rkey = mr->rkey = rdma_backend_mr_rkey(&mr->backend_mr); - } else { - /* We keep mr_handle in lkey so send and recv get get mr ptr */ - *lkey = *mr_handle; - *rkey = -1; - } + /* We keep mr_handle in lkey so send and recv get get mr ptr */ + *lkey = *mr_handle; + *rkey = -1; mr->pd_handle = pd_handle; @@ -214,7 +201,10 @@ void rdma_rm_dealloc_mr(RdmaDeviceResources *dev_res, uint32_t mr_handle) if (mr) { rdma_backend_destroy_mr(&mr->backend_mr); - munmap(mr->virt, mr->length); + pr_dbg("start=0x%" PRIx64 "\n", mr->start); + if (mr->start) { + munmap(mr->virt, mr->length); + } res_tbl_dealloc(&dev_res->mr_tbl, mr_handle); } } From 7131c4b028b09c8545a16614b38aefdd4a77b0b4 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 5 Aug 2018 18:35:13 +0300 Subject: [PATCH 08/15] hw/rdma: Reorder resource cleanup To be consistence with allocation do the reverse order in deallocation Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum Message-Id: <20180805153518.2983-9-yuval.shaia@oracle.com> Signed-off-by: Marcel Apfelbaum --- hw/rdma/rdma_rm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c index bf4a5c71b4..1f014b4ab2 100644 --- a/hw/rdma/rdma_rm.c +++ b/hw/rdma/rdma_rm.c @@ -543,8 +543,9 @@ void rdma_rm_fini(RdmaDeviceResources *dev_res) res_tbl_free(&dev_res->uc_tbl); res_tbl_free(&dev_res->cqe_ctx_tbl); res_tbl_free(&dev_res->qp_tbl); - res_tbl_free(&dev_res->cq_tbl); res_tbl_free(&dev_res->mr_tbl); + res_tbl_free(&dev_res->cq_tbl); res_tbl_free(&dev_res->pd_tbl); + g_hash_table_destroy(dev_res->qp_hash); } From 67b32fe250a0a9759e4ead14bdcc1d992dd9bf44 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 5 Aug 2018 18:35:14 +0300 Subject: [PATCH 09/15] hw/pvrdma: Cosmetic change - indent right Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum Message-Id: <20180805153518.2983-10-yuval.shaia@oracle.com> Signed-off-by: Marcel Apfelbaum --- hw/rdma/vmw/pvrdma_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c index 1b1330e113..3d448bffc4 100644 --- a/hw/rdma/vmw/pvrdma_main.c +++ b/hw/rdma/vmw/pvrdma_main.c @@ -430,7 +430,7 @@ static void regs_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) reset_device(dev); break; } - break; + break; case PVRDMA_REG_IMR: pr_dbg("Interrupt mask=0x%" PRIx64 "\n", val); dev->interrupt_mask = val; @@ -439,7 +439,7 @@ static void regs_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) if (val == 0) { execute_command(dev); } - break; + break; default: break; } From 292dce627b51329212130470f485b7dc2b72b48c Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 5 Aug 2018 18:35:15 +0300 Subject: [PATCH 10/15] hw/rdma: Cosmetic change - move to generic function To ease maintenance of struct comp_thread move all related code to dedicated function. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum Message-Id: <20180805153518.2983-11-yuval.shaia@oracle.com> Signed-off-by: Marcel Apfelbaum --- hw/rdma/rdma_backend.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c index 52981d652d..d29acc505b 100644 --- a/hw/rdma/rdma_backend.c +++ b/hw/rdma/rdma_backend.c @@ -146,10 +146,10 @@ static void *comp_handler_thread(void *arg) return NULL; } -static void stop_comp_thread(RdmaBackendDev *backend_dev) +static void stop_backend_thread(RdmaBackendThread *thread) { - backend_dev->comp_thread.run = false; - while (backend_dev->comp_thread.is_running) { + thread->run = false; + while (thread->is_running) { pr_dbg("Waiting for thread to complete\n"); sleep(THR_POLL_TO / SCALE_US / 2); } @@ -159,7 +159,7 @@ static void start_comp_thread(RdmaBackendDev *backend_dev) { char thread_name[THR_NAME_LEN] = {0}; - stop_comp_thread(backend_dev); + stop_backend_thread(&backend_dev->comp_thread); snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s", ibv_get_device_name(backend_dev->ib_dev)); @@ -876,7 +876,7 @@ void rdma_backend_start(RdmaBackendDev *backend_dev) void rdma_backend_stop(RdmaBackendDev *backend_dev) { pr_dbg("Stopping rdma_backend\n"); - stop_comp_thread(backend_dev); + stop_backend_thread(&backend_dev->comp_thread); } void rdma_backend_fini(RdmaBackendDev *backend_dev) From abc665aae30a6fa3b6bb6784b535ab656461abd3 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 5 Aug 2018 18:35:16 +0300 Subject: [PATCH 11/15] hw/rdma: Print backend QP number in hex format To be consistent with other prints throughout the code fix places that print it as decimal number. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum Message-Id: <20180805153518.2983-12-yuval.shaia@oracle.com> Signed-off-by: Marcel Apfelbaum --- hw/rdma/rdma_rm.c | 4 ++-- hw/rdma/vmw/pvrdma_qp_ops.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c index 1f014b4ab2..859c900003 100644 --- a/hw/rdma/rdma_rm.c +++ b/hw/rdma/rdma_rm.c @@ -389,7 +389,7 @@ int rdma_rm_modify_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev, RdmaRmQP *qp; int ret; - pr_dbg("qpn=%d\n", qp_handle); + pr_dbg("qpn=0x%x\n", qp_handle); qp = rdma_rm_get_qp(dev_res, qp_handle); if (!qp) { @@ -447,7 +447,7 @@ int rdma_rm_query_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev, { RdmaRmQP *qp; - pr_dbg("qpn=%d\n", qp_handle); + pr_dbg("qpn=0x%x\n", qp_handle); qp = rdma_rm_get_qp(dev_res, qp_handle); if (!qp) { diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c index a8664f40c8..c668afd0ed 100644 --- a/hw/rdma/vmw/pvrdma_qp_ops.c +++ b/hw/rdma/vmw/pvrdma_qp_ops.c @@ -130,7 +130,7 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) PvrdmaSqWqe *wqe; PvrdmaRing *ring; - pr_dbg("qp_handle=%d\n", qp_handle); + pr_dbg("qp_handle=0x%x\n", qp_handle); qp = rdma_rm_get_qp(&dev->rdma_dev_res, qp_handle); if (unlikely(!qp)) { @@ -174,7 +174,7 @@ int pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle) PvrdmaRqWqe *wqe; PvrdmaRing *ring; - pr_dbg("qp_handle=%d\n", qp_handle); + pr_dbg("qp_handle=0x%x\n", qp_handle); qp = rdma_rm_get_qp(&dev->rdma_dev_res, qp_handle); if (unlikely(!qp)) { From 7d2ce4b0169e6bd2bb1b87eb69f1fae96c854ffc Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 5 Aug 2018 18:35:17 +0300 Subject: [PATCH 12/15] hw/rdma: Bugfix - Support non-aligned buffers RDMA application can provide non-aligned buffers to be registered. In such case the DMA address passed by driver is pointing to the beginning of the physical address of the mapped page so we can't distinguish between two addresses from the same page. Fix it by keeping the offset of the virtual address in mr->virt. Signed-off-by: Yuval Shaia Reviewed-by: Marcel Apfelbaum Message-Id: <20180805153518.2983-13-yuval.shaia@oracle.com> Signed-off-by: Marcel Apfelbaum --- hw/rdma/rdma_rm.c | 2 ++ hw/rdma/vmw/pvrdma_cmd.c | 1 + 2 files changed, 3 insertions(+) diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c index 859c900003..8d59a42cd1 100644 --- a/hw/rdma/rdma_rm.c +++ b/hw/rdma/rdma_rm.c @@ -166,6 +166,7 @@ int rdma_rm_alloc_mr(RdmaDeviceResources *dev_res, uint32_t pd_handle, mr->virt = host_virt; mr->start = guest_start; mr->length = guest_length; + mr->virt += (mr->start & (TARGET_PAGE_SIZE - 1)); ret = rdma_backend_create_mr(&mr->backend_mr, &pd->backend_pd, mr->virt, mr->length, access_flags); @@ -203,6 +204,7 @@ void rdma_rm_dealloc_mr(RdmaDeviceResources *dev_res, uint32_t mr_handle) rdma_backend_destroy_mr(&mr->backend_mr); pr_dbg("start=0x%" PRIx64 "\n", mr->start); if (mr->start) { + mr->virt -= (mr->start & (TARGET_PAGE_SIZE - 1)); munmap(mr->virt, mr->length); } res_tbl_dealloc(&dev_res->mr_tbl, mr_handle); diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c index 4f74b29936..b74b0b48d0 100644 --- a/hw/rdma/vmw/pvrdma_cmd.c +++ b/hw/rdma/vmw/pvrdma_cmd.c @@ -59,6 +59,7 @@ static void *pvrdma_map_to_pdir(PCIDevice *pdev, uint64_t pdir_dma, } host_virt = mremap(curr_page, 0, length, MREMAP_MAYMOVE); + pr_dbg("mremap %p -> %p\n", curr_page, host_virt); if (host_virt == MAP_FAILED) { host_virt = NULL; error_report("PVRDMA: Failed to remap memory for host_virt"); From 430e440c59e8a6bcf8806576f37470641298af31 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 5 Aug 2018 18:35:18 +0300 Subject: [PATCH 13/15] hw/rdma: Add reference to pci_dev in backend_dev The field backend_dev->dev is not initialized, fix it. Signed-off-by: Yuval Shaia Message-Id: <20180805153518.2983-14-yuval.shaia@oracle.com> Reviewed-by: Marcel Apfelbaum Signed-off-by: Marcel Apfelbaum --- hw/rdma/rdma_backend.c | 6 +++++- hw/rdma/rdma_backend.h | 2 +- hw/rdma/vmw/pvrdma_main.c | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c index d29acc505b..d7a4bbd91f 100644 --- a/hw/rdma/rdma_backend.c +++ b/hw/rdma/rdma_backend.c @@ -744,7 +744,7 @@ static int init_device_caps(RdmaBackendDev *backend_dev, return 0; } -int rdma_backend_init(RdmaBackendDev *backend_dev, +int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, RdmaDeviceResources *rdma_dev_res, const char *backend_device_name, uint8_t port_num, uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr, @@ -756,6 +756,10 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, struct ibv_device **dev_list; struct ibv_port_attr port_attr; + memset(backend_dev, 0, sizeof(*backend_dev)); + + backend_dev->dev = pdev; + backend_dev->backend_gid_idx = backend_gid_idx; backend_dev->port_num = port_num; backend_dev->rdma_dev_res = rdma_dev_res; diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h index 3049a73962..86e8fe8ab6 100644 --- a/hw/rdma/rdma_backend.h +++ b/hw/rdma/rdma_backend.h @@ -46,7 +46,7 @@ static inline uint32_t rdma_backend_mr_rkey(const RdmaBackendMR *mr) return mr->ibmr ? mr->ibmr->rkey : 0; } -int rdma_backend_init(RdmaBackendDev *backend_dev, +int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev, RdmaDeviceResources *rdma_dev_res, const char *backend_device_name, uint8_t port_num, uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr, diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c index 3d448bffc4..ca5fa8d981 100644 --- a/hw/rdma/vmw/pvrdma_main.c +++ b/hw/rdma/vmw/pvrdma_main.c @@ -611,7 +611,7 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp) goto out; } - rc = rdma_backend_init(&dev->backend_dev, &dev->rdma_dev_res, + rc = rdma_backend_init(&dev->backend_dev, pdev, &dev->rdma_dev_res, dev->backend_device_name, dev->backend_port_num, dev->backend_gid_idx, &dev->dev_attr, errp); if (rc) { From 72221d03b85fcfba27a47a2af08ee59150550ced Mon Sep 17 00:00:00 2001 From: Marcel Apfelbaum Date: Sat, 11 Aug 2018 20:15:34 +0300 Subject: [PATCH 14/15] hw/pvrdma: remove not needed include No need to include linux/types.h, is empty anyway. Suggested-by: Thomas Huth Signed-off-by: Marcel Apfelbaum Message-Id: <20180811171534.11917-1-marcel.apfelbaum@gmail.com> Reviewed-by: Yuval Shaia --- hw/rdma/vmw/pvrdma_cmd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c index b74b0b48d0..4faeb21631 100644 --- a/hw/rdma/vmw/pvrdma_cmd.c +++ b/hw/rdma/vmw/pvrdma_cmd.c @@ -16,7 +16,6 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" #include "cpu.h" -#include #include "hw/hw.h" #include "hw/pci/pci.h" #include "hw/pci/pci_ids.h" From 21ab34c9543fe1b6d31b3edbd01a397e7e090d00 Mon Sep 17 00:00:00 2001 From: Marcel Apfelbaum Date: Thu, 16 Aug 2018 18:16:37 +0300 Subject: [PATCH 15/15] config: split PVRDMA from RDMA In some BSD systems RDMA migration is possible while the pvrdma device can't be used because the mremap system call is missing. Reported-by: Rebecca Cran Signed-off-by: Marcel Apfelbaum Message-Id: <20180816151637.24553-1-marcel.apfelbaum@gmail.com> Reviewed-by: Thomas Huth --- configure | 55 ++++++++++++++++++++++++++++++++++++++++++- hw/rdma/Makefile.objs | 2 +- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/configure b/configure index db97930314..7d9a63636c 100755 --- a/configure +++ b/configure @@ -375,6 +375,7 @@ hax="no" hvf="no" whpx="no" rdma="" +pvrdma="" gprof="no" debug_tcg="no" debug="no" @@ -1363,6 +1364,10 @@ for opt do ;; --disable-rdma) rdma="no" ;; + --enable-pvrdma) pvrdma="yes" + ;; + --disable-pvrdma) pvrdma="no" + ;; --with-gtkabi=*) gtkabi="$optarg" ;; --disable-vte) vte="no" @@ -1669,7 +1674,8 @@ disabled with --disable-FEATURE, default is enabled if available: hax HAX acceleration support hvf Hypervisor.framework acceleration support whpx Windows Hypervisor Platform acceleration support - rdma Enable RDMA-based migration and PVRDMA support + rdma Enable RDMA-based migration + pvrdma Enable PVRDMA support vde support for vde network netmap support for netmap network linux-aio Linux AIO support @@ -3064,6 +3070,48 @@ EOF fi fi +########################################## +# PVRDMA detection + +cat > $TMPC < + +int +main(void) +{ + char buf = 0; + void *addr = &buf; + addr = mremap(addr, 0, 1, MREMAP_MAYMOVE | MREMAP_FIXED); + + return 0; +} +EOF + +if test "$rdma" = "yes" ; then + case "$pvrdma" in + "") + if compile_prog "" ""; then + pvrdma="yes" + else + pvrdma="no" + fi + ;; + "yes") + if ! compile_prog "" ""; then + error_exit "PVRDMA is not supported since mremap is not implemented" + fi + pvrdma="yes" + ;; + "no") + pvrdma="no" + ;; + esac +else + if test "$pvrdma" = "yes" ; then + error_exit "PVRDMA requires rdma suppport" + fi + pvrdma="no" +fi ########################################## # VNC SASL detection @@ -5952,6 +6000,7 @@ if test "$tcg" = "yes" ; then fi echo "malloc trim support $malloc_trim" echo "RDMA support $rdma" +echo "PVRDMA support $pvrdma" echo "fdt support $fdt" echo "membarrier $membarrier" echo "preadv support $preadv" @@ -6708,6 +6757,10 @@ if test "$rdma" = "yes" ; then echo "RDMA_LIBS=$rdma_libs" >> $config_host_mak fi +if test "$pvrdma" = "yes" ; then + echo "CONFIG_PVRDMA=y" >> $config_host_mak +fi + if test "$have_rtnetlink" = "yes" ; then echo "CONFIG_RTNETLINK=y" >> $config_host_mak fi diff --git a/hw/rdma/Makefile.objs b/hw/rdma/Makefile.objs index 3504c39d21..bd36cbf51c 100644 --- a/hw/rdma/Makefile.objs +++ b/hw/rdma/Makefile.objs @@ -1,4 +1,4 @@ -ifeq ($(CONFIG_RDMA),y) +ifeq ($(CONFIG_PVRDMA),y) obj-$(CONFIG_PCI) += rdma_utils.o rdma_backend.o rdma_rm.o obj-$(CONFIG_PCI) += vmw/pvrdma_dev_ring.o vmw/pvrdma_cmd.o \ vmw/pvrdma_qp_ops.o vmw/pvrdma_main.o