From 416efba0bd69b2f57da95049d505fff289269c11 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Oct 2017 15:40:51 -0700 Subject: [PATCH 01/34] xen/pvcalls: introduce the pvcalls xenbus frontend Introduce a xenbus frontend for the pvcalls protocol, as defined by https://xenbits.xen.org/docs/unstable/misc/pvcalls.html. This patch only adds the stubs; the code will be added by the following patches. Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 61 +++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 drivers/xen/pvcalls-front.c diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c new file mode 100644 index 000000000000..a8d38c283014 --- /dev/null +++ b/drivers/xen/pvcalls-front.c @@ -0,0 +1,61 @@ +/* + * (c) 2017 Stefano Stabellini + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/module.h> + +#include <xen/events.h> +#include <xen/grant_table.h> +#include <xen/xen.h> +#include <xen/xenbus.h> +#include <xen/interface/io/pvcalls.h> + +static const struct xenbus_device_id pvcalls_front_ids[] = { + { "pvcalls" }, + { "" } +}; + +static int pvcalls_front_remove(struct xenbus_device *dev) +{ + return 0; +} + +static int pvcalls_front_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + return 0; +} + +static void pvcalls_front_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ +} + +static struct xenbus_driver pvcalls_front_driver = { + .ids = pvcalls_front_ids, + .probe = pvcalls_front_probe, + .remove = pvcalls_front_remove, + .otherend_changed = pvcalls_front_changed, +}; + +static int __init pvcalls_frontend_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + pr_info("Initialising Xen pvcalls frontend driver\n"); + + return xenbus_register_frontend(&pvcalls_front_driver); +} + +module_init(pvcalls_frontend_init); From aa7ba37678307fab5a57c0b4868db7ce60701d7a Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Oct 2017 15:40:52 -0700 Subject: [PATCH 02/34] xen/pvcalls: implement frontend disconnect Introduce a data structure named pvcalls_bedata. It contains pointers to the command ring, the event channel, a list of active sockets and a list of passive sockets. List accesses are protected by a spin_lock. Introduce a waitqueue to allow waiting for a response on commands sent to the backend. Introduce an array of struct xen_pvcalls_response to store command responses. Introduce a new struct sock_mapping to keep track of sockets. In this patch the struct sock_mapping is minimal; the fields will be added by the next patches. pvcalls_refcount is used to keep count of the outstanding pvcalls users. Only remove connections once the refcount is zero. Implement the pvcalls frontend removal function. Go through the list of active and passive sockets and free them all, one at a time.
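For illustration, a minimal sketch (not part of the patch; pvcalls_front_example_op is a hypothetical caller) of the guard pattern that the frontend operations added by the following patches use to synchronize with this removal path:

    static int pvcalls_front_example_op(void)
    {
    	struct pvcalls_bedata *bedata;

    	pvcalls_enter();		/* atomic_inc(&pvcalls_refcount) */
    	if (!pvcalls_front_dev) {	/* frontend already disconnected */
    		pvcalls_exit();
    		return -ENOTCONN;
    	}
    	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
    	/* ... issue a command and wait for the response ... */
    	pvcalls_exit();			/* atomic_dec(&pvcalls_refcount) */
    	return 0;
    }

pvcalls_front_remove() can then spin until pvcalls_refcount drains to zero before freeing anything.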
Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 71 +++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index a8d38c283014..aae23d046967 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -20,6 +20,51 @@ #include <xen/xenbus.h> #include <xen/interface/io/pvcalls.h> +#define PVCALLS_INVALID_ID UINT_MAX +#define PVCALLS_RING_ORDER XENBUS_MAX_RING_GRANT_ORDER +#define PVCALLS_NR_RSP_PER_RING __CONST_RING_SIZE(xen_pvcalls, XEN_PAGE_SIZE) + +struct pvcalls_bedata { + struct xen_pvcalls_front_ring ring; + grant_ref_t ref; + int irq; + + struct list_head socket_mappings; + spinlock_t socket_lock; + + wait_queue_head_t inflight_req; + struct xen_pvcalls_response rsp[PVCALLS_NR_RSP_PER_RING]; +}; /* Only one front/back connection supported. */ +static struct xenbus_device *pvcalls_front_dev; +static atomic_t pvcalls_refcount; + /* first increment refcount, then proceed */ +#define pvcalls_enter() { \ + atomic_inc(&pvcalls_refcount); \ +} + /* first complete other operations, then decrement refcount */ +#define pvcalls_exit() { \ + atomic_dec(&pvcalls_refcount); \ +} + +struct sock_mapping { + bool active_socket; + struct list_head list; + struct socket *sock; +}; + +static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id) +{ + return IRQ_HANDLED; +} + +static void pvcalls_front_free_map(struct pvcalls_bedata *bedata, + struct sock_mapping *map) +{ +} + static const struct xenbus_device_id pvcalls_front_ids[] = { { "pvcalls" }, { "" } @@ -27,6 +72,32 @@ static const struct xenbus_device_id pvcalls_front_ids[] = { static int pvcalls_front_remove(struct xenbus_device *dev) { + struct pvcalls_bedata *bedata; + struct sock_mapping *map = NULL, *n; + + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + dev_set_drvdata(&dev->dev, NULL); + pvcalls_front_dev = NULL; + if (bedata->irq >= 0) + unbind_from_irqhandler(bedata->irq, dev); + + smp_mb(); + while (atomic_read(&pvcalls_refcount) > 0) + cpu_relax(); + list_for_each_entry_safe(map, n, &bedata->socket_mappings, list) { + if (map->active_socket) { + /* No need to lock, refcount is 0 */ + pvcalls_front_free_map(bedata, map); + } else { + list_del(&map->list); + kfree(map); + } + } + if (bedata->ref >= 0) + gnttab_end_foreign_access(bedata->ref, 0, 0); + kfree(bedata->ring.sring); + kfree(bedata); + xenbus_switch_state(dev, XenbusStateClosed); return 0; } From 21968190991320c0d9b6d1280e7dfda420164db1 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Oct 2017 15:40:53 -0700 Subject: [PATCH 03/34] xen/pvcalls: connect to the backend Implement the probe function for the pvcalls frontend. Read the supported versions, max-page-order and function-calls nodes from xenstore. Only one frontend<->backend connection is supported at any given time for a guest. Store the active frontend device in a static pointer. Introduce a stub function for the event handler.
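For orientation, a sketch of the xenstore handshake this probe implements: backend nodes are read from dev->otherend, frontend nodes are written under dev->nodename (node names come from the patch below and the pvcalls spec; the values shown are hypothetical):

    backend nodes (read by probe):     frontend nodes (written by probe):
      versions       = "1"               version  = "1"
      max-page-order = "6"               ring-ref = "8"
      function-calls = "1"               port     = "4"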
Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 132 ++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index aae23d046967..1618502c1c66 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -104,12 +104,144 @@ static int pvcalls_front_remove(struct xenbus_device *dev) static int pvcalls_front_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) { + int ret = -ENOMEM, evtchn, i; + unsigned int max_page_order, function_calls, len; + char *versions; + grant_ref_t gref_head = 0; + struct xenbus_transaction xbt; + struct pvcalls_bedata *bedata = NULL; + struct xen_pvcalls_sring *sring; + + if (pvcalls_front_dev != NULL) { + dev_err(&dev->dev, "only one PV Calls connection supported\n"); + return -EINVAL; + } + + versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len); + if (!len) + return -EINVAL; + if (strcmp(versions, "1")) { + kfree(versions); + return -EINVAL; + } + kfree(versions); + max_page_order = xenbus_read_unsigned(dev->otherend, + "max-page-order", 0); + if (max_page_order < PVCALLS_RING_ORDER) + return -ENODEV; + function_calls = xenbus_read_unsigned(dev->otherend, + "function-calls", 0); + /* See XENBUS_FUNCTIONS_CALLS in pvcalls.h */ + if (function_calls != 1) + return -ENODEV; + pr_info("%s max-page-order is %u\n", __func__, max_page_order); + + bedata = kzalloc(sizeof(struct pvcalls_bedata), GFP_KERNEL); + if (!bedata) + return -ENOMEM; + + dev_set_drvdata(&dev->dev, bedata); + pvcalls_front_dev = dev; + init_waitqueue_head(&bedata->inflight_req); + INIT_LIST_HEAD(&bedata->socket_mappings); + spin_lock_init(&bedata->socket_lock); + bedata->irq = -1; + bedata->ref = -1; + + for (i = 0; i < PVCALLS_NR_RSP_PER_RING; i++) + bedata->rsp[i].req_id = PVCALLS_INVALID_ID; + + sring = (struct xen_pvcalls_sring *) __get_free_page(GFP_KERNEL | + __GFP_ZERO); + if (!sring) + goto error; + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&bedata->ring, sring, XEN_PAGE_SIZE); + + ret = xenbus_alloc_evtchn(dev, &evtchn); + if (ret) + goto error; + + bedata->irq = bind_evtchn_to_irqhandler(evtchn, + pvcalls_front_event_handler, + 0, "pvcalls-frontend", dev); + if (bedata->irq < 0) { + ret = bedata->irq; + goto error; + } + + ret = gnttab_alloc_grant_references(1, &gref_head); + if (ret < 0) + goto error; + bedata->ref = gnttab_claim_grant_reference(&gref_head); + if (bedata->ref < 0) { + ret = bedata->ref; + goto error; + } + gnttab_grant_foreign_access_ref(bedata->ref, dev->otherend_id, + virt_to_gfn((void *)sring), 0); + + again: + ret = xenbus_transaction_start(&xbt); + if (ret) { + xenbus_dev_fatal(dev, ret, "starting transaction"); + goto error; + } + ret = xenbus_printf(xbt, dev->nodename, "version", "%u", 1); + if (ret) + goto error_xenbus; + ret = xenbus_printf(xbt, dev->nodename, "ring-ref", "%d", bedata->ref); + if (ret) + goto error_xenbus; + ret = xenbus_printf(xbt, dev->nodename, "port", "%u", + evtchn); + if (ret) + goto error_xenbus; + ret = xenbus_transaction_end(xbt, 0); + if (ret) { + if (ret == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, ret, "completing transaction"); + goto error; + } + xenbus_switch_state(dev, XenbusStateInitialised); + return 0; + + error_xenbus: + xenbus_transaction_end(xbt, 1); + xenbus_dev_fatal(dev, ret, "writing xenstore"); + error: + pvcalls_front_remove(dev); + return ret; } static void 
pvcalls_front_changed(struct xenbus_device *dev, enum xenbus_state backend_state) { + switch (backend_state) { + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + case XenbusStateInitialising: + case XenbusStateInitialised: + case XenbusStateUnknown: + break; + + case XenbusStateInitWait: + break; + + case XenbusStateConnected: + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosed: + if (dev->state == XenbusStateClosed) + break; + /* Missed the backend's CLOSING state -- fallthrough */ + case XenbusStateClosing: + xenbus_frontend_closed(dev); + break; + } } static struct xenbus_driver pvcalls_front_driver = { From 2195046bfd69e487d9a76dc47840f15c8412840c Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Oct 2017 15:40:54 -0700 Subject: [PATCH 04/34] xen/pvcalls: implement socket command and handle events Send a PVCALLS_SOCKET command to the backend, using the masked req_prod_pvt as req_id. This way, req_id is guaranteed to be between 0 and PVCALLS_NR_REQ_PER_RING. We already have a slot in the rsp array ready for the response, and there cannot be two outstanding responses with the same req_id. Wait for the response by waiting on the inflight_req waitqueue and checking the req_id field in rsp[req_id]. Use atomic accesses and barriers to read the field. Note that the barriers are simple smp barriers (as opposed to virt barriers) because they are for internal frontend synchronization, not frontend<->backend communication. Once a response is received, clear the corresponding rsp slot by setting req_id to PVCALLS_INVALID_ID. Note that PVCALLS_INVALID_ID is invalid only from the frontend point of view. It is not part of the PVCalls protocol. pvcalls_front_event_handler is in charge of copying responses from the ring to the appropriate rsp slot. It is done by copying the body of the response first, then by copying req_id atomically. After the copies, wake up anybody waiting on the waitqueue. socket_lock protects accesses to the ring. Convert the pointer to sock_mapping into an uintptr_t and use it as id for the new socket to pass to the backend. The struct will be fully initialized later on connect or bind. sock->sk->sk_send_head is not used for ip sockets: reuse the field to store a pointer to the struct sock_mapping corresponding to the socket. This way, we can easily get the struct sock_mapping from the struct socket.
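Condensed from the patch below, the barrier pairing between the event handler (producer) and a command issuer (consumer):

    /* producer (pvcalls_front_event_handler): body first, then req_id */
    memcpy(dst, src, sizeof(*rsp) - sizeof(rsp->req_id));
    smp_wmb();
    bedata->rsp[req_id].req_id = rsp->req_id;

    /* consumer (e.g. pvcalls_front_socket): req_id first, then body */
    wait_event(bedata->inflight_req,
    	   READ_ONCE(bedata->rsp[req_id].req_id) == req_id);
    smp_rmb();
    ret = bedata->rsp[req_id].ret;

Publishing req_id last guarantees that once the consumer observes its own req_id in the slot, the rest of the response body is visible.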
Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 131 ++++++++++++++++++++++++++++++++++++ drivers/xen/pvcalls-front.h | 8 +++ 2 files changed, 139 insertions(+) create mode 100644 drivers/xen/pvcalls-front.h diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 1618502c1c66..326395d09e31 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -13,6 +13,10 @@ */ #include <linux/module.h> +#include <linux/net.h> +#include <linux/socket.h> + +#include <net/sock.h> #include <xen/events.h> #include <xen/grant_table.h> @@ -20,6 +24,8 @@ #include <xen/xenbus.h> #include <xen/interface/io/pvcalls.h> +#include "pvcalls-front.h" + #define PVCALLS_INVALID_ID UINT_MAX #define PVCALLS_RING_ORDER XENBUS_MAX_RING_GRANT_ORDER #define PVCALLS_NR_RSP_PER_RING __CONST_RING_SIZE(xen_pvcalls, XEN_PAGE_SIZE) @@ -55,8 +61,58 @@ struct sock_mapping { struct socket *sock; }; +static inline int get_request(struct pvcalls_bedata *bedata, int *req_id) +{ + *req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1); + if (RING_FULL(&bedata->ring) || + bedata->rsp[*req_id].req_id != PVCALLS_INVALID_ID) + return -EAGAIN; + return 0; +} + static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id) { + struct xenbus_device *dev = dev_id; + struct pvcalls_bedata *bedata; + struct xen_pvcalls_response *rsp; + uint8_t *src, *dst; + int req_id = 0, more = 0, done = 0; + + if (dev == NULL) + return IRQ_HANDLED; + + pvcalls_enter(); + bedata = dev_get_drvdata(&dev->dev); + if (bedata == NULL) { + pvcalls_exit(); + return IRQ_HANDLED; + } + +again: + while (RING_HAS_UNCONSUMED_RESPONSES(&bedata->ring)) { + rsp = RING_GET_RESPONSE(&bedata->ring, bedata->ring.rsp_cons); + + req_id = rsp->req_id; + dst = (uint8_t *)&bedata->rsp[req_id] + sizeof(rsp->req_id); + src = (uint8_t *)rsp + sizeof(rsp->req_id); + memcpy(dst, src, sizeof(*rsp) - sizeof(rsp->req_id)); + /* + * First copy the rest of the data, then req_id. It is + * paired with the barrier when accessing bedata->rsp. + */ + smp_wmb(); + bedata->rsp[req_id].req_id = rsp->req_id; + + done = 1; + bedata->ring.rsp_cons++; + } + + RING_FINAL_CHECK_FOR_RESPONSES(&bedata->ring, more); + if (more) + goto again; + if (done) + wake_up(&bedata->inflight_req); + pvcalls_exit(); return IRQ_HANDLED; } @@ -65,6 +121,81 @@ static void pvcalls_front_free_map(struct pvcalls_bedata *bedata, { } +int pvcalls_front_socket(struct socket *sock) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map = NULL; + struct xen_pvcalls_request *req; + int notify, req_id, ret; + + /* + * PVCalls only supports domain AF_INET, + * type SOCK_STREAM and protocol 0 sockets for now. + * + * Check socket type here, AF_INET and protocol checks are done + * by the caller. + */ + if (sock->type != SOCK_STREAM) + return -EOPNOTSUPP; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -EACCES; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (map == NULL) { + pvcalls_exit(); + return -ENOMEM; + } + + spin_lock(&bedata->socket_lock); + + ret = get_request(bedata, &req_id); + if (ret < 0) { + kfree(map); + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + + /* + * sock->sk->sk_send_head is not used for ip sockets: reuse the + * field to store a pointer to the struct sock_mapping + * corresponding to the socket. This way, we can easily get the + * struct sock_mapping from the struct socket.
+ */ + sock->sk->sk_send_head = (void *)map; + list_add_tail(&map->list, &bedata->socket_mappings); + + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_SOCKET; + req->u.socket.id = (uintptr_t) map; + req->u.socket.domain = AF_INET; + req->u.socket.type = SOCK_STREAM; + req->u.socket.protocol = IPPROTO_IP; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + /* read req_id, then the content */ + smp_rmb(); + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + + pvcalls_exit(); + return ret; +} + static const struct xenbus_device_id pvcalls_front_ids[] = { { "pvcalls" }, { "" } diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h new file mode 100644 index 000000000000..b7dabedf5ccb --- /dev/null +++ b/drivers/xen/pvcalls-front.h @@ -0,0 +1,8 @@ +#ifndef __PVCALLS_FRONT_H__ +#define __PVCALLS_FRONT_H__ + +#include <linux/net.h> + +int pvcalls_front_socket(struct socket *sock); + +#endif From cb1c7d9bbc878a67af270ccd2d23f6d1450579db Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Oct 2017 15:40:55 -0700 Subject: [PATCH 05/34] xen/pvcalls: implement connect command Send PVCALLS_CONNECT to the backend. Allocate a new ring and evtchn for the active socket. Introduce fields in struct sock_mapping to keep track of active sockets. Introduce a waitqueue to allow the frontend to wait on data coming from the backend on the active socket (recvmsg command). Two mutexes (one for reads and one for writes) will be used to protect the active socket in and out rings from concurrent accesses.
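A sketch of the per-active-socket memory that create_active() in the patch below sets up: one indexes page plus 1 << PVCALLS_RING_ORDER data pages, all granted to the backend (field names per struct pvcalls_data_intf in the pvcalls protocol header):

    map->active.ring (one page): in_cons/in_prod/in_error,
                                 out_cons/out_prod/out_error,
                                 ring_order, ref[1 << ring_order]

    data pages (1 << PVCALLS_RING_ORDER pages):
    +-------------------------------+--------------------------------+
    | data.in (XEN_FLEX_RING_SIZE)  | data.out (XEN_FLEX_RING_SIZE)  |
    +-------------------------------+--------------------------------+

XEN_FLEX_RING_SIZE(order) is half of the total data area, so the in and out rings each get an equal share.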
Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 158 ++++++++++++++++++++++++++++++++++++ drivers/xen/pvcalls-front.h | 2 + 2 files changed, 160 insertions(+) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 326395d09e31..8d4a43e6aa46 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -59,6 +59,18 @@ struct sock_mapping { bool active_socket; struct list_head list; struct socket *sock; + union { + struct { + int irq; + grant_ref_t ref; + struct pvcalls_data_intf *ring; + struct pvcalls_data data; + struct mutex in_mutex; + struct mutex out_mutex; + + wait_queue_head_t inflight_conn_req; + } active; + }; }; static inline int get_request(struct pvcalls_bedata *bedata, int *req_id) @@ -121,6 +133,18 @@ static void pvcalls_front_free_map(struct pvcalls_bedata *bedata, { } +static irqreturn_t pvcalls_front_conn_handler(int irq, void *sock_map) +{ + struct sock_mapping *map = sock_map; + + if (map == NULL) + return IRQ_HANDLED; + + wake_up_interruptible(&map->active.inflight_conn_req); + + return IRQ_HANDLED; +} + int pvcalls_front_socket(struct socket *sock) { struct pvcalls_bedata *bedata; @@ -196,6 +220,132 @@ int pvcalls_front_socket(struct socket *sock) return ret; } +static int create_active(struct sock_mapping *map, int *evtchn) +{ + void *bytes; + int ret = -ENOMEM, irq = -1, i; + + *evtchn = -1; + init_waitqueue_head(&map->active.inflight_conn_req); + + map->active.ring = (struct pvcalls_data_intf *) + __get_free_page(GFP_KERNEL | __GFP_ZERO); + if (map->active.ring == NULL) + goto out_error; + map->active.ring->ring_order = PVCALLS_RING_ORDER; + bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + PVCALLS_RING_ORDER); + if (bytes == NULL) + goto out_error; + for (i = 0; i < (1 << PVCALLS_RING_ORDER); i++) + map->active.ring->ref[i] = gnttab_grant_foreign_access( + pvcalls_front_dev->otherend_id, + pfn_to_gfn(virt_to_pfn(bytes) + i), 0); + + map->active.ref = gnttab_grant_foreign_access( + pvcalls_front_dev->otherend_id, + pfn_to_gfn(virt_to_pfn((void *)map->active.ring)), 0); + + map->active.data.in = bytes; + map->active.data.out = bytes + + XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + + ret = xenbus_alloc_evtchn(pvcalls_front_dev, evtchn); + if (ret) + goto out_error; + irq = bind_evtchn_to_irqhandler(*evtchn, pvcalls_front_conn_handler, + 0, "pvcalls-frontend", map); + if (irq < 0) { + ret = irq; + goto out_error; + } + + map->active.irq = irq; + map->active_socket = true; + mutex_init(&map->active.in_mutex); + mutex_init(&map->active.out_mutex); + + return 0; + +out_error: + if (irq >= 0) + unbind_from_irqhandler(irq, map); + else if (*evtchn >= 0) + xenbus_free_evtchn(pvcalls_front_dev, *evtchn); + kfree(map->active.data.in); + kfree(map->active.ring); + return ret; +} + +int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, + int addr_len, int flags) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map = NULL; + struct xen_pvcalls_request *req; + int notify, req_id, ret, evtchn; + + if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM) + return -EOPNOTSUPP; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -ENOTCONN; + } + + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *)sock->sk->sk_send_head; + if (!map) { + pvcalls_exit(); + return -ENOTSOCK; + } + + spin_lock(&bedata->socket_lock); + ret = 
get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + ret = create_active(map, &evtchn); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_CONNECT; + req->u.connect.id = (uintptr_t)map; + req->u.connect.len = addr_len; + req->u.connect.flags = flags; + req->u.connect.ref = map->active.ref; + req->u.connect.evtchn = evtchn; + memcpy(req->u.connect.addr, addr, sizeof(*addr)); + + map->sock = sock; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + /* read req_id, then the content */ + smp_rmb(); + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + pvcalls_exit(); + return ret; +} + static const struct xenbus_device_id pvcalls_front_ids[] = { { "pvcalls" }, { "" } @@ -212,6 +362,14 @@ static int pvcalls_front_remove(struct xenbus_device *dev) if (bedata->irq >= 0) unbind_from_irqhandler(bedata->irq, dev); + list_for_each_entry_safe(map, n, &bedata->socket_mappings, list) { + map->sock->sk->sk_send_head = NULL; + if (map->active_socket) { + map->active.ring->in_error = -EBADF; + wake_up_interruptible(&map->active.inflight_conn_req); + } + } + smp_mb(); while (atomic_read(&pvcalls_refcount) > 0) cpu_relax(); diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h index b7dabedf5ccb..63b0417c31d3 100644 --- a/drivers/xen/pvcalls-front.h +++ b/drivers/xen/pvcalls-front.h @@ -4,5 +4,7 @@ #include <linux/net.h> int pvcalls_front_socket(struct socket *sock); +int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, + int addr_len, int flags); #endif From 67ea9893cbc2f892f491ec620d6bbab262b80770 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Oct 2017 15:40:56 -0700 Subject: [PATCH 06/34] xen/pvcalls: implement bind command Send PVCALLS_BIND to the backend. Introduce a new structure, part of struct sock_mapping, to store information specific to passive sockets. Introduce a status field to keep track of the status of the passive socket.
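The status values introduced here (see the patch below) form a small state machine for passive sockets; a sketch of the transitions:

    /*
     * PVCALLS_STATUS_UNINITALIZED --bind()--> PVCALLS_STATUS_BIND
     * PVCALLS_STATUS_BIND       --listen()--> PVCALLS_STATUS_LISTEN
     * PVCALLS_STATUS_LISTEN     --accept()--> (spawns a new active
     *                                          sock_mapping; status
     *                                          stays LISTEN)
     */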
Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 66 +++++++++++++++++++++++++++++++++++++ drivers/xen/pvcalls-front.h | 3 ++ 2 files changed, 69 insertions(+) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 8d4a43e6aa46..6e70ec3ea796 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -70,6 +70,13 @@ struct sock_mapping { wait_queue_head_t inflight_conn_req; } active; + struct { + /* Socket status */ +#define PVCALLS_STATUS_UNINITALIZED 0 +#define PVCALLS_STATUS_BIND 1 +#define PVCALLS_STATUS_LISTEN 2 + uint8_t status; + } passive; }; }; @@ -346,6 +353,65 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, return ret; } +int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map = NULL; + struct xen_pvcalls_request *req; + int notify, req_id, ret; + + if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM) + return -EOPNOTSUPP; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -ENOTCONN; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (map == NULL) { + pvcalls_exit(); + return -ENOTSOCK; + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + map->sock = sock; + req->cmd = PVCALLS_BIND; + req->u.bind.id = (uintptr_t)map; + memcpy(req->u.bind.addr, addr, sizeof(*addr)); + req->u.bind.len = addr_len; + + map->active_socket = false; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + /* read req_id, then the content */ + smp_rmb(); + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + + map->passive.status = PVCALLS_STATUS_BIND; + pvcalls_exit(); + return 0; +} + static const struct xenbus_device_id pvcalls_front_ids[] = { { "pvcalls" }, { "" } diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h index 63b0417c31d3..8b0a2749d4b8 100644 --- a/drivers/xen/pvcalls-front.h +++ b/drivers/xen/pvcalls-front.h @@ -6,5 +6,8 @@ int pvcalls_front_socket(struct socket *sock); int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, int addr_len, int flags); +int pvcalls_front_bind(struct socket *sock, + struct sockaddr *addr, + int addr_len); #endif From 1853f11d72ed46310ff3c5a60264b2f8a53f8c25 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Oct 2017 15:40:57 -0700 Subject: [PATCH 07/34] xen/pvcalls: implement listen command Send PVCALLS_LISTEN to the backend. 
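Taken together with the two previous patches, a hypothetical caller-side sketch of the passive path (the struct socket *sock, the port, and the backlog value are illustrative; the AF_INET/protocol checks are assumed already done by the caller, as noted earlier):

    struct sockaddr_in addr = {
    	.sin_family      = AF_INET,
    	.sin_port        = htons(8080),		/* example port */
    	.sin_addr.s_addr = htonl(INADDR_ANY),
    };
    int err;

    err = pvcalls_front_socket(sock);			/* PVCALLS_SOCKET */
    if (!err)
    	err = pvcalls_front_bind(sock, (struct sockaddr *)&addr,
    				 sizeof(addr));		/* PVCALLS_BIND */
    if (!err)
    	err = pvcalls_front_listen(sock, 5);		/* PVCALLS_LISTEN */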
Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 57 +++++++++++++++++++++++++++++++++++++ drivers/xen/pvcalls-front.h | 1 + 2 files changed, 58 insertions(+) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 6e70ec3ea796..09b2a64fef91 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -412,6 +412,63 @@ int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len) return 0; } +int pvcalls_front_listen(struct socket *sock, int backlog) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + struct xen_pvcalls_request *req; + int notify, req_id, ret; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -ENOTCONN; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (!map) { + pvcalls_exit(); + return -ENOTSOCK; + } + + if (map->passive.status != PVCALLS_STATUS_BIND) { + pvcalls_exit(); + return -EOPNOTSUPP; + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_LISTEN; + req->u.listen.id = (uintptr_t) map; + req->u.listen.backlog = backlog; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + /* read req_id, then the content */ + smp_rmb(); + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + + map->passive.status = PVCALLS_STATUS_LISTEN; + pvcalls_exit(); + return ret; +} + static const struct xenbus_device_id pvcalls_front_ids[] = { { "pvcalls" }, { "" } diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h index 8b0a2749d4b8..aa8fe1007da5 100644 --- a/drivers/xen/pvcalls-front.h +++ b/drivers/xen/pvcalls-front.h @@ -9,5 +9,6 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len); +int pvcalls_front_listen(struct socket *sock, int backlog); #endif From 9774c6cca26610d065a75d2ac8c5e3fcf0a209b3 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Oct 2017 15:40:58 -0700 Subject: [PATCH 08/34] xen/pvcalls: implement accept command Introduce a waitqueue to allow only one outstanding accept command at any given time and to implement polling on the passive socket. Introduce a flags field to keep track of in-flight accept and poll commands. Send PVCALLS_ACCEPT to the backend. Allocate a new active socket. Make sure that only one accept command is executed at any given time by setting PVCALLS_FLAG_ACCEPT_INFLIGHT and waiting on the inflight_accept_req waitqueue. Convert the new struct sock_mapping pointer into an uintptr_t and use it as id for the new socket to pass to the backend. Check if the accept call is non-blocking: in that case, after sending the ACCEPT command to the backend, store the sock_mapping pointer of the new struct and the inflight req_id, then return -EAGAIN (the backend will respond only when there is something to accept).
The next time accept is called, we'll check if the ACCEPT command has been answered: if so, we'll pick up where we left off; otherwise we return -EAGAIN again. Note that, differently from the other commands, we can use wait_event_interruptible (instead of wait_event) in the case of accept as we are able to track the req_id of the ACCEPT response that we are waiting for. Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 145 ++++++++++++++++++++++++++++++++++++ drivers/xen/pvcalls-front.h | 3 + 2 files changed, 148 insertions(+) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 09b2a64fef91..d781ac4bcbde 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -76,6 +76,16 @@ struct sock_mapping { #define PVCALLS_STATUS_BIND 1 #define PVCALLS_STATUS_LISTEN 2 uint8_t status; + /* + * Internal state-machine flags. + * Only one accept operation can be inflight for a socket. + * Only one poll operation can be inflight for a given socket. + */ +#define PVCALLS_FLAG_ACCEPT_INFLIGHT 0 + uint8_t flags; + uint32_t inflight_req_id; + struct sock_mapping *accept_map; + wait_queue_head_t inflight_accept_req; } passive; }; }; @@ -391,6 +401,8 @@ int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len) memcpy(req->u.bind.addr, addr, sizeof(*addr)); req->u.bind.len = addr_len; + init_waitqueue_head(&map->passive.inflight_accept_req); + map->active_socket = false; bedata->ring.req_prod_pvt++; @@ -469,6 +481,139 @@ int pvcalls_front_listen(struct socket *sock, int backlog) return ret; } +int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + struct sock_mapping *map2 = NULL; + struct xen_pvcalls_request *req; + int notify, req_id, ret, evtchn, nonblock; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -ENOTCONN; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (!map) { + pvcalls_exit(); + return -ENOTSOCK; + } + + if (map->passive.status != PVCALLS_STATUS_LISTEN) { + pvcalls_exit(); + return -EINVAL; + } + + nonblock = flags & SOCK_NONBLOCK; + /* + * Backend only supports 1 inflight accept request, will return + * errors for the others + */ + if (test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags)) { + req_id = READ_ONCE(map->passive.inflight_req_id); + if (req_id != PVCALLS_INVALID_ID && + READ_ONCE(bedata->rsp[req_id].req_id) == req_id) { + map2 = map->passive.accept_map; + goto received; + } + if (nonblock) { + pvcalls_exit(); + return -EAGAIN; + } + if (wait_event_interruptible(map->passive.inflight_accept_req, + !test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags))) { + pvcalls_exit(); + return -EINTR; + } + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags); + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + map2 = kzalloc(sizeof(*map2), GFP_KERNEL); + if (map2 == NULL) { + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags); + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return -ENOMEM; + } + ret = create_active(map2, &evtchn); + if (ret < 0) { + kfree(map2); + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void
*)&map->passive.flags); + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + list_add_tail(&map2->list, &bedata->socket_mappings); + + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_ACCEPT; + req->u.accept.id = (uintptr_t) map; + req->u.accept.ref = map2->active.ref; + req->u.accept.id_new = (uintptr_t) map2; + req->u.accept.evtchn = evtchn; + map->passive.accept_map = map2; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + /* We could check if we have received a response before returning. */ + if (nonblock) { + WRITE_ONCE(map->passive.inflight_req_id, req_id); + pvcalls_exit(); + return -EAGAIN; + } + + if (wait_event_interruptible(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id)) { + pvcalls_exit(); + return -EINTR; + } + /* read req_id, then the content */ + smp_rmb(); + +received: + map2->sock = newsock; + newsock->sk = kzalloc(sizeof(*newsock->sk), GFP_KERNEL); + if (!newsock->sk) { + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + map->passive.inflight_req_id = PVCALLS_INVALID_ID; + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags); + pvcalls_front_free_map(bedata, map2); + pvcalls_exit(); + return -ENOMEM; + } + newsock->sk->sk_send_head = (void *)map2; + + ret = bedata->rsp[req_id].ret; + bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; + map->passive.inflight_req_id = PVCALLS_INVALID_ID; + + clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); + wake_up(&map->passive.inflight_accept_req); + + pvcalls_exit(); + return ret; +} + static const struct xenbus_device_id pvcalls_front_ids[] = { { "pvcalls" }, { "" } diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h index aa8fe1007da5..ab4f1dad3142 100644 --- a/drivers/xen/pvcalls-front.h +++ b/drivers/xen/pvcalls-front.h @@ -10,5 +10,8 @@ int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len); int pvcalls_front_listen(struct socket *sock, int backlog); +int pvcalls_front_accept(struct socket *sock, + struct socket *newsock, + int flags); #endif From 45ddce214a797de65c0418bebf90c1b564c3ee3d Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Oct 2017 15:40:59 -0700 Subject: [PATCH 09/34] xen/pvcalls: implement sendmsg Send data to an active socket by copying data to the "out" ring. Take the active socket out_mutex so that only one function can access the ring at any given time. If not enough room is available on the ring, rather than returning immediately or sleep-waiting, spin for up to 5000 cycles. This small optimization turns out to improve performance significantly. 
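A worked example of the ring-index arithmetic used by __write_ring in the patch below (the numbers are hypothetical; pvcalls_queued() and pvcalls_mask() come from the pvcalls protocol header):

    /*
     * array_size = 8 (a hypothetical tiny ring), out_cons = 6, out_prod = 10:
     *
     *   size        = pvcalls_queued(10, 6, 8) = 4   bytes already queued
     *   free        = array_size - size        = 4   bytes writable
     *   masked_prod = pvcalls_mask(10, 8)      = 2
     *   masked_cons = pvcalls_mask(6, 8)       = 6
     *
     * masked_prod < masked_cons, so the 4 writable bytes are contiguous
     * at offsets 2..5 and a single copy_from_iter() suffices; otherwise
     * the write is split at the end of the array, as in the code below.
     */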
Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 121 ++++++++++++++++++++++++++++++++++++ drivers/xen/pvcalls-front.h | 3 + 2 files changed, 124 insertions(+) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index d781ac4bcbde..76725780e767 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -29,6 +29,7 @@ #define PVCALLS_INVALID_ID UINT_MAX #define PVCALLS_RING_ORDER XENBUS_MAX_RING_GRANT_ORDER #define PVCALLS_NR_RSP_PER_RING __CONST_RING_SIZE(xen_pvcalls, XEN_PAGE_SIZE) +#define PVCALLS_FRONT_MAX_SPIN 5000 struct pvcalls_bedata { struct xen_pvcalls_front_ring ring; @@ -99,6 +100,23 @@ static inline int get_request(struct pvcalls_bedata *bedata, int *req_id) return 0; } +static bool pvcalls_front_write_todo(struct sock_mapping *map) +{ + struct pvcalls_data_intf *intf = map->active.ring; + RING_IDX cons, prod, size = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + int32_t error; + + error = intf->out_error; + if (error == -ENOTCONN) + return false; + if (error != 0) + return true; + + cons = intf->out_cons; + prod = intf->out_prod; + return !!(size - pvcalls_queued(prod, cons, size)); +} + static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id) { struct xenbus_device *dev = dev_id; @@ -363,6 +381,109 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, return ret; } +static int __write_ring(struct pvcalls_data_intf *intf, + struct pvcalls_data *data, + struct iov_iter *msg_iter, + int len) +{ + RING_IDX cons, prod, size, masked_prod, masked_cons; + RING_IDX array_size = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + int32_t error; + + error = intf->out_error; + if (error < 0) + return error; + cons = intf->out_cons; + prod = intf->out_prod; + /* read indexes before continuing */ + virt_mb(); + + size = pvcalls_queued(prod, cons, array_size); + if (size >= array_size) + return -EINVAL; + if (len > array_size - size) + len = array_size - size; + + masked_prod = pvcalls_mask(prod, array_size); + masked_cons = pvcalls_mask(cons, array_size); + + if (masked_prod < masked_cons) { + len = copy_from_iter(data->out + masked_prod, len, msg_iter); + } else { + if (len > array_size - masked_prod) { + int ret = copy_from_iter(data->out + masked_prod, + array_size - masked_prod, msg_iter); + if (ret != array_size - masked_prod) { + len = ret; + goto out; + } + len = ret + copy_from_iter(data->out, len - ret, msg_iter); + } else { + len = copy_from_iter(data->out + masked_prod, len, msg_iter); + } + } +out: + /* write to ring before updating pointer */ + virt_wmb(); + intf->out_prod += len; + + return len; +} + +int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg, + size_t len) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + int sent, tot_sent = 0; + int count = 0, flags; + + flags = msg->msg_flags; + if (flags & (MSG_CONFIRM|MSG_DONTROUTE|MSG_EOR|MSG_OOB)) + return -EOPNOTSUPP; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -ENOTCONN; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (!map) { + pvcalls_exit(); + return -ENOTSOCK; + } + + mutex_lock(&map->active.out_mutex); + if ((flags & MSG_DONTWAIT) && !pvcalls_front_write_todo(map)) { + mutex_unlock(&map->active.out_mutex); + pvcalls_exit(); + return -EAGAIN; + } + if (len > INT_MAX) + len = INT_MAX; + +again: + count++; + 
sent = __write_ring(map->active.ring, + &map->active.data, &msg->msg_iter, + len); + if (sent > 0) { + len -= sent; + tot_sent += sent; + notify_remote_via_irq(map->active.irq); + } + if (sent >= 0 && len > 0 && count < PVCALLS_FRONT_MAX_SPIN) + goto again; + if (sent < 0) + tot_sent = sent; + + mutex_unlock(&map->active.out_mutex); + pvcalls_exit(); + return tot_sent; +} + int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct pvcalls_bedata *bedata; diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h index ab4f1dad3142..d937c24c90c3 100644 --- a/drivers/xen/pvcalls-front.h +++ b/drivers/xen/pvcalls-front.h @@ -13,5 +13,8 @@ int pvcalls_front_listen(struct socket *sock, int backlog); int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags); +int pvcalls_front_sendmsg(struct socket *sock, + struct msghdr *msg, + size_t len); #endif From ae0d04052e077ccb71772bf5c7cb1049f4d587d2 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Oct 2017 15:41:00 -0700 Subject: [PATCH 10/34] xen/pvcalls: implement recvmsg Implement recvmsg by copying data from the "in" ring. If not enough data is available and the recvmsg call is blocking, then wait on the inflight_conn_req waitqueue. Take the active socket in_mutex so that only one function can access the ring at any given time. Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 111 ++++++++++++++++++++++++++++++++++++ drivers/xen/pvcalls-front.h | 4 ++ 2 files changed, 115 insertions(+) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 76725780e767..24ffdc90e217 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -117,6 +117,20 @@ static bool pvcalls_front_write_todo(struct sock_mapping *map) return !!(size - pvcalls_queued(prod, cons, size)); } +static bool pvcalls_front_read_todo(struct sock_mapping *map) +{ + struct pvcalls_data_intf *intf = map->active.ring; + RING_IDX cons, prod; + int32_t error; + + cons = intf->in_cons; + prod = intf->in_prod; + error = intf->in_error; + return (error != 0 || + pvcalls_queued(prod, cons, + XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER)) != 0); +} + static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id) { struct xenbus_device *dev = dev_id; @@ -484,6 +498,103 @@ int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg, return tot_sent; } +static int __read_ring(struct pvcalls_data_intf *intf, + struct pvcalls_data *data, + struct iov_iter *msg_iter, + size_t len, int flags) +{ + RING_IDX cons, prod, size, masked_prod, masked_cons; + RING_IDX array_size = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + int32_t error; + + cons = intf->in_cons; + prod = intf->in_prod; + error = intf->in_error; + /* get pointers before reading from the ring */ + virt_rmb(); + if (error < 0) + return error; + + size = pvcalls_queued(prod, cons, array_size); + masked_prod = pvcalls_mask(prod, array_size); + masked_cons = pvcalls_mask(cons, array_size); + + if (size == 0) + return 0; + + if (len > size) + len = size; + + if (masked_prod > masked_cons) { + len = copy_to_iter(data->in + masked_cons, len, msg_iter); + } else { + if (len > (array_size - masked_cons)) { + int ret = copy_to_iter(data->in + masked_cons, + array_size - masked_cons, msg_iter); + if (ret != array_size - masked_cons) { + len = ret; + goto out; + } + len = ret + copy_to_iter(data->in, len - 
ret, msg_iter); + } else { + len = copy_to_iter(data->in + masked_cons, len, msg_iter); + } + } +out: + /* read data from the ring before increasing the index */ + virt_mb(); + if (!(flags & MSG_PEEK)) + intf->in_cons += len; + + return len; +} + +int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct pvcalls_bedata *bedata; + int ret; + struct sock_mapping *map; + + if (flags & (MSG_CMSG_CLOEXEC|MSG_ERRQUEUE|MSG_OOB|MSG_TRUNC)) + return -EOPNOTSUPP; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -ENOTCONN; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (!map) { + pvcalls_exit(); + return -ENOTSOCK; + } + + mutex_lock(&map->active.in_mutex); + if (len > XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER)) + len = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); + + while (!(flags & MSG_DONTWAIT) && !pvcalls_front_read_todo(map)) { + wait_event_interruptible(map->active.inflight_conn_req, + pvcalls_front_read_todo(map)); + } + ret = __read_ring(map->active.ring, &map->active.data, + &msg->msg_iter, len, flags); + + if (ret > 0) + notify_remote_via_irq(map->active.irq); + if (ret == 0) + ret = (flags & MSG_DONTWAIT) ? -EAGAIN : 0; + if (ret == -ENOTCONN) + ret = 0; + + mutex_unlock(&map->active.in_mutex); + pvcalls_exit(); + return ret; +} + int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct pvcalls_bedata *bedata; diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h index d937c24c90c3..de24041e9a86 100644 --- a/drivers/xen/pvcalls-front.h +++ b/drivers/xen/pvcalls-front.h @@ -16,5 +16,9 @@ int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg, size_t len); +int pvcalls_front_recvmsg(struct socket *sock, + struct msghdr *msg, + size_t len, + int flags); #endif From 5842c83596fcfa742978ec2840440ab56c7fdf79 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Oct 2017 15:41:01 -0700 Subject: [PATCH 11/34] xen/pvcalls: implement poll command For active sockets, check the indexes and use the inflight_conn_req waitqueue to wait. For passive sockets, if an accept is outstanding (PVCALLS_FLAG_ACCEPT_INFLIGHT), check if it has been answered by looking at bedata->rsp[req_id]. If so, return POLLIN. Otherwise use the inflight_accept_req waitqueue. If no accepts are inflight, send PVCALLS_POLL to the backend. If we have outstanding POLL requests awaiting a response, use the inflight_req waitqueue: inflight_req is woken up when a new response is received; on wakeup we check whether the POLL response has arrived by looking at the PVCALLS_FLAG_POLL_RET flag. We set the flag from pvcalls_front_event_handler if the response was for a POLL command. In pvcalls_front_event_handler, get the struct sock_mapping from the poll id (we previously converted struct sock_mapping* to uintptr_t and used it as id).
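A condensed sketch of the flag protocol for passive-socket poll described above (all names from the patch below):

    /* poller: mark a poll in flight and send the request */
    if (!test_and_set_bit(PVCALLS_FLAG_POLL_INFLIGHT,
    		      (void *)&map->passive.flags)) {
    	/* ... queue PVCALLS_POLL, then poll_wait() on inflight_req ... */
    }

    /* event handler: clear INFLIGHT, then publish RET */
    clear_bit(PVCALLS_FLAG_POLL_INFLIGHT, (void *)&map->passive.flags);
    smp_wmb();
    set_bit(PVCALLS_FLAG_POLL_RET, (void *)&map->passive.flags);

    /* poller, on wakeup: consume RET and report readiness */
    if (test_and_clear_bit(PVCALLS_FLAG_POLL_RET,
    		       (void *)&map->passive.flags))
    	return POLLIN | POLLRDNORM;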
Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 144 +++++++++++++++++++++++++++++++++--- drivers/xen/pvcalls-front.h | 3 + 2 files changed, 138 insertions(+), 9 deletions(-) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 24ffdc90e217..c7d4251c3678 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -83,6 +83,8 @@ struct sock_mapping { * Only one poll operation can be inflight for a given socket. */ #define PVCALLS_FLAG_ACCEPT_INFLIGHT 0 +#define PVCALLS_FLAG_POLL_INFLIGHT 1 +#define PVCALLS_FLAG_POLL_RET 2 uint8_t flags; uint32_t inflight_req_id; struct sock_mapping *accept_map; @@ -154,15 +156,32 @@ static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id) rsp = RING_GET_RESPONSE(&bedata->ring, bedata->ring.rsp_cons); req_id = rsp->req_id; - dst = (uint8_t *)&bedata->rsp[req_id] + sizeof(rsp->req_id); - src = (uint8_t *)rsp + sizeof(rsp->req_id); - memcpy(dst, src, sizeof(*rsp) - sizeof(rsp->req_id)); - /* - * First copy the rest of the data, then req_id. It is - * paired with the barrier when accessing bedata->rsp. - */ - smp_wmb(); - bedata->rsp[req_id].req_id = rsp->req_id; + if (rsp->cmd == PVCALLS_POLL) { + struct sock_mapping *map = (struct sock_mapping *)(uintptr_t) + rsp->u.poll.id; + + clear_bit(PVCALLS_FLAG_POLL_INFLIGHT, + (void *)&map->passive.flags); + /* + * clear INFLIGHT, then set RET. It pairs with + * the checks at the beginning of + * pvcalls_front_poll_passive. + */ + smp_wmb(); + set_bit(PVCALLS_FLAG_POLL_RET, + (void *)&map->passive.flags); + } else { + dst = (uint8_t *)&bedata->rsp[req_id] + + sizeof(rsp->req_id); + src = (uint8_t *)rsp + sizeof(rsp->req_id); + memcpy(dst, src, sizeof(*rsp) - sizeof(rsp->req_id)); + /* + * First copy the rest of the data, then req_id. It is + * paired with the barrier when accessing bedata->rsp. + */ + smp_wmb(); + bedata->rsp[req_id].req_id = req_id; + } done = 1; bedata->ring.rsp_cons++; @@ -846,6 +865,113 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) return ret; } +static unsigned int pvcalls_front_poll_passive(struct file *file, + struct pvcalls_bedata *bedata, + struct sock_mapping *map, + poll_table *wait) +{ + int notify, req_id, ret; + struct xen_pvcalls_request *req; + + if (test_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, + (void *)&map->passive.flags)) { + uint32_t req_id = READ_ONCE(map->passive.inflight_req_id); + + if (req_id != PVCALLS_INVALID_ID && + READ_ONCE(bedata->rsp[req_id].req_id) == req_id) + return POLLIN | POLLRDNORM; + + poll_wait(file, &map->passive.inflight_accept_req, wait); + return 0; + } + + if (test_and_clear_bit(PVCALLS_FLAG_POLL_RET, + (void *)&map->passive.flags)) + return POLLIN | POLLRDNORM; + + /* + * First check RET, then INFLIGHT. No barriers necessary to + * ensure execution ordering because of the conditional + * instructions creating control dependencies. 
+ */ + + if (test_and_set_bit(PVCALLS_FLAG_POLL_INFLIGHT, + (void *)&map->passive.flags)) { + poll_wait(file, &bedata->inflight_req, wait); + return 0; + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + return ret; + } + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_POLL; + req->u.poll.id = (uintptr_t) map; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + poll_wait(file, &bedata->inflight_req, wait); + return 0; +} + +static unsigned int pvcalls_front_poll_active(struct file *file, + struct pvcalls_bedata *bedata, + struct sock_mapping *map, + poll_table *wait) +{ + unsigned int mask = 0; + int32_t in_error, out_error; + struct pvcalls_data_intf *intf = map->active.ring; + + out_error = intf->out_error; + in_error = intf->in_error; + + poll_wait(file, &map->active.inflight_conn_req, wait); + if (pvcalls_front_write_todo(map)) + mask |= POLLOUT | POLLWRNORM; + if (pvcalls_front_read_todo(map)) + mask |= POLLIN | POLLRDNORM; + if (in_error != 0 || out_error != 0) + mask |= POLLERR; + + return mask; +} + +unsigned int pvcalls_front_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + int ret; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return POLLNVAL; + } + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (!map) { + pvcalls_exit(); + return POLLNVAL; + } + if (map->active_socket) + ret = pvcalls_front_poll_active(file, bedata, map, wait); + else + ret = pvcalls_front_poll_passive(file, bedata, map, wait); + pvcalls_exit(); + return ret; +} + static const struct xenbus_device_id pvcalls_front_ids[] = { { "pvcalls" }, { "" } diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h index de24041e9a86..25e05b8f2d72 100644 --- a/drivers/xen/pvcalls-front.h +++ b/drivers/xen/pvcalls-front.h @@ -20,5 +20,8 @@ int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); +unsigned int pvcalls_front_poll(struct file *file, + struct socket *sock, + poll_table *wait); #endif From 235a71c5390316429e2c7823119c5ea439816beb Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Oct 2017 15:41:02 -0700 Subject: [PATCH 12/34] xen/pvcalls: implement release command Send PVCALLS_RELEASE to the backend and wait for a reply. Take both in_mutex and out_mutex to avoid concurrent accesses. Then, free the socket. For passive sockets, check whether we have already pre-allocated an active socket for the purpose of being accepted. If so, free that as well. 
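Condensed from the patch below, the shutdown ordering that pvcalls_front_release() relies on for active sockets:

    sock->sk->sk_send_head = NULL;		/* no new readers/writers */
    /* ... send PVCALLS_RELEASE and wait for the response ... */
    map->active.ring->in_error = -EBADF;	/* force recvmsg waiters out */
    wake_up_interruptible(&map->active.inflight_conn_req);
    while (!mutex_trylock(&map->active.in_mutex) ||
           !mutex_trylock(&map->active.out_mutex))
    	cpu_relax();			/* drain existing waiters */
    pvcalls_front_free_map(bedata, map);	/* now safe to free */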
Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 98 +++++++++++++++++++++++++++++++++++++ drivers/xen/pvcalls-front.h | 1 + 2 files changed, 99 insertions(+) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index c7d4251c3678..0c1ec6894cc4 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -199,6 +199,21 @@ static irqreturn_t pvcalls_front_event_handler(int irq, void *dev_id) static void pvcalls_front_free_map(struct pvcalls_bedata *bedata, struct sock_mapping *map) { + int i; + + unbind_from_irqhandler(map->active.irq, map); + + spin_lock(&bedata->socket_lock); + if (!list_empty(&map->list)) + list_del_init(&map->list); + spin_unlock(&bedata->socket_lock); + + for (i = 0; i < (1 << PVCALLS_RING_ORDER); i++) + gnttab_end_foreign_access(map->active.ring->ref[i], 0, 0); + gnttab_end_foreign_access(map->active.ref, 0, 0); + free_page((unsigned long)map->active.ring); + + kfree(map); } static irqreturn_t pvcalls_front_conn_handler(int irq, void *sock_map) @@ -972,6 +987,89 @@ unsigned int pvcalls_front_poll(struct file *file, struct socket *sock, return ret; } +int pvcalls_front_release(struct socket *sock) +{ + struct pvcalls_bedata *bedata; + struct sock_mapping *map; + int req_id, notify, ret; + struct xen_pvcalls_request *req; + + if (sock->sk == NULL) + return 0; + + pvcalls_enter(); + if (!pvcalls_front_dev) { + pvcalls_exit(); + return -EIO; + } + + bedata = dev_get_drvdata(&pvcalls_front_dev->dev); + + map = (struct sock_mapping *) sock->sk->sk_send_head; + if (map == NULL) { + pvcalls_exit(); + return 0; + } + + spin_lock(&bedata->socket_lock); + ret = get_request(bedata, &req_id); + if (ret < 0) { + spin_unlock(&bedata->socket_lock); + pvcalls_exit(); + return ret; + } + sock->sk->sk_send_head = NULL; + + req = RING_GET_REQUEST(&bedata->ring, req_id); + req->req_id = req_id; + req->cmd = PVCALLS_RELEASE; + req->u.release.id = (uintptr_t)map; + + bedata->ring.req_prod_pvt++; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&bedata->ring, notify); + spin_unlock(&bedata->socket_lock); + if (notify) + notify_remote_via_irq(bedata->irq); + + wait_event(bedata->inflight_req, + READ_ONCE(bedata->rsp[req_id].req_id) == req_id); + + if (map->active_socket) { + /* + * Set in_error and wake up inflight_conn_req to force + * recvmsg waiters to exit. + */ + map->active.ring->in_error = -EBADF; + wake_up_interruptible(&map->active.inflight_conn_req); + + /* + * Wait until there are no more waiters on the mutexes. + * We know that no new waiters can be added because sk_send_head + * is set to NULL -- we only need to wait for the existing + * waiters to return. 
+ */ + while (!mutex_trylock(&map->active.in_mutex) || + !mutex_trylock(&map->active.out_mutex)) + cpu_relax(); + + pvcalls_front_free_map(bedata, map); + } else { + spin_lock(&bedata->socket_lock); + list_del(&map->list); + spin_unlock(&bedata->socket_lock); + if (READ_ONCE(map->passive.inflight_req_id) != + PVCALLS_INVALID_ID) { + pvcalls_front_free_map(bedata, + map->passive.accept_map); + } + kfree(map); + } + WRITE_ONCE(bedata->rsp[req_id].req_id, PVCALLS_INVALID_ID); + + pvcalls_exit(); + return 0; +} + static const struct xenbus_device_id pvcalls_front_ids[] = { { "pvcalls" }, { "" } diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h index 25e05b8f2d72..3332978f4fcd 100644 --- a/drivers/xen/pvcalls-front.h +++ b/drivers/xen/pvcalls-front.h @@ -23,5 +23,6 @@ int pvcalls_front_recvmsg(struct socket *sock, unsigned int pvcalls_front_poll(struct file *file, struct socket *sock, poll_table *wait); +int pvcalls_front_release(struct socket *sock); #endif From 5eee149ab9f83d1d0eb16c6b32a97d7060eeb490 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 30 Oct 2017 15:41:03 -0700 Subject: [PATCH 13/34] xen: introduce a Kconfig option to enable the pvcalls frontend Also add pvcalls-front to the Makefile. Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- drivers/xen/Kconfig | 11 +++++++++++ drivers/xen/Makefile | 1 + 2 files changed, 12 insertions(+) diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 4545561954ee..d8dd54678ab7 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -196,6 +196,17 @@ config XEN_PCIDEV_BACKEND If in doubt, say m. +config XEN_PVCALLS_FRONTEND + tristate "XEN PV Calls frontend driver" + depends on INET && XEN + default n + select XEN_XENBUS_FRONTEND + help + Experimental frontend for the Xen PV Calls protocol + (https://xenbits.xen.org/docs/unstable/misc/pvcalls.html). It + sends a small set of POSIX calls to the backend, which + implements them. + config XEN_PVCALLS_BACKEND bool "XEN PV Calls backend driver" depends on INET && XEN && XEN_BACKEND diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index caaa15dc37bc..82466f265164 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_XEN_EFI) += efi.o obj-$(CONFIG_XEN_SCSI_BACKEND) += xen-scsiback.o obj-$(CONFIG_XEN_AUTO_XLATE) += xlate_mmu.o obj-$(CONFIG_XEN_PVCALLS_BACKEND) += pvcalls-back.o +obj-$(CONFIG_XEN_PVCALLS_FRONTEND) += pvcalls-front.o xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o xen-gntalloc-y := gntalloc.o From 6f0e8bf16730a36ff6773802d8c8df56d10e60cd Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 27 Oct 2017 19:49:37 +0200 Subject: [PATCH 14/34] xen: support 52 bit physical addresses in pv guests Physical addresses on processors supporting 5 level paging can be up to 52 bits wide. For a Xen pv guest running on such a machine those physical addresses have to be supported in order to be able to use any memory on the machine even if the guest itself does not support 5 level paging. So when reading/writing a MFN from/to a pte don't use the kernel's PTE_PFN_MASK but a new XEN_PTE_MFN_MASK allowing full 40 bit wide MFNs. 
Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- arch/x86/include/asm/xen/page.h | 11 ++++++++++- arch/x86/xen/mmu_pv.c | 4 ++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 07b6531813c4..90e91003fd9d 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -26,6 +26,15 @@ typedef struct xpaddr { phys_addr_t paddr; } xpaddr_t; +#ifdef CONFIG_X86_64 +#define XEN_PHYSICAL_MASK __sme_clr((1UL << 52) - 1) +#else +#define XEN_PHYSICAL_MASK __PHYSICAL_MASK +#endif + +#define XEN_PTE_MFN_MASK ((pteval_t)(((signed long)PAGE_MASK) & \ + XEN_PHYSICAL_MASK)) + #define XMADDR(x) ((xmaddr_t) { .maddr = (x) }) #define XPADDR(x) ((xpaddr_t) { .paddr = (x) }) @@ -277,7 +286,7 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn) static inline unsigned long pte_mfn(pte_t pte) { - return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pte.pte & XEN_PTE_MFN_MASK) >> PAGE_SHIFT; } static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 71495f1a86d7..9d9cc3870722 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -315,7 +315,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, static pteval_t pte_mfn_to_pfn(pteval_t val) { if (val & _PAGE_PRESENT) { - unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + unsigned long mfn = (val & XEN_PTE_MFN_MASK) >> PAGE_SHIFT; unsigned long pfn = mfn_to_pfn(mfn); pteval_t flags = val & PTE_FLAGS_MASK; @@ -1735,7 +1735,7 @@ static unsigned long __init m2p(phys_addr_t maddr) { phys_addr_t paddr; - maddr &= PTE_PFN_MASK; + maddr &= XEN_PTE_MFN_MASK; paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; return paddr; From 5e25f5db6abb96ca8ee2aaedcb863daa6dfcc07a Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Wed, 1 Nov 2017 09:46:33 +0800 Subject: [PATCH 15/34] xen/time: do not decrease steal time after live migration on xen After guest live migration on xen, steal time in /proc/stat (cpustat[CPUTIME_STEAL]) might decrease because steal returned by xen_steal_lock() might be less than this_rq()->prev_steal_time which is derived from previous return value of xen_steal_clock(). For instance, steal time of each vcpu is 335 before live migration. cpu 198 0 368 200064 1962 0 0 1340 0 0 cpu0 38 0 81 50063 492 0 0 335 0 0 cpu1 65 0 97 49763 634 0 0 335 0 0 cpu2 38 0 81 50098 462 0 0 335 0 0 cpu3 56 0 107 50138 374 0 0 335 0 0 After live migration, steal time is reduced to 312. cpu 200 0 370 200330 1971 0 0 1248 0 0 cpu0 38 0 82 50123 500 0 0 312 0 0 cpu1 65 0 97 49832 634 0 0 312 0 0 cpu2 39 0 82 50167 462 0 0 312 0 0 cpu3 56 0 107 50207 374 0 0 312 0 0 Since runstate times are cumulative and cleared during xen live migration by xen hypervisor, the idea of this patch is to accumulate runstate times to global percpu variables before live migration suspend. Once guest VM is resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new runstate times and previously accumulated times stored in global percpu variables. Comment above HYPERVISOR_suspend() has been removed as it is inaccurate: the call can return an error code (e.g., possibly -EPERM in the future). 
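As a hedged sketch of the accumulation idea above, not the patch itself (the per-cpu storage and the four runstate buckets are collapsed into a single counter):

#include <stdio.h>
#include <stdint.h>

static uint64_t old_steal;              /* a per-cpu array in the real code */

static uint64_t steal_clock(uint64_t raw)   /* raw runstate time read from Xen */
{
	return raw + old_steal;
}

static void before_suspend(uint64_t raw)
{
	old_steal += raw;               /* bank the counter Xen will zero */
}

int main(void)
{
	before_suspend(steal_clock(335));  /* vcpu shows 335 before migration */
	/* after resume the raw counter restarts from 0 ... */
	printf("%llu\n", (unsigned long long)steal_clock(0));  /* 335, not 0 */
	return 0;
}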
Similar and more severe issue would impact prior linux 4.8-4.10 as discussed by Michael Las at https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest, which would overflow steal time and lead to 100% st usage in top command for linux 4.8-4.10. A backport of this patch would fix that issue. [boris: added linux/slab.h to driver/xen/time.c, slightly reformatted commit message] References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest Signed-off-by: Dongli Zhang Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/manage.c | 7 ++--- drivers/xen/time.c | 72 +++++++++++++++++++++++++++++++++++++++++-- include/xen/xen-ops.h | 1 + 3 files changed, 73 insertions(+), 7 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index c425d03d37d2..8835065029d3 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -72,18 +72,15 @@ static int xen_suspend(void *data) } gnttab_suspend(); + xen_manage_runstate_time(-1); xen_arch_pre_suspend(); - /* - * This hypercall returns 1 if suspend was cancelled - * or the domain was merely checkpointed, and 0 if it - * is resuming in a new domain. - */ si->cancelled = HYPERVISOR_suspend(xen_pv_domain() ? virt_to_gfn(xen_start_info) : 0); xen_arch_post_suspend(si->cancelled); + xen_manage_runstate_time(si->cancelled ? 1 : 0); gnttab_resume(); if (!si->cancelled) { diff --git a/drivers/xen/time.c b/drivers/xen/time.c index ac5f23fcafc2..8c46f555d82a 100644 --- a/drivers/xen/time.c +++ b/drivers/xen/time.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,8 @@ /* runstate info updated by Xen */ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); +static DEFINE_PER_CPU(u64[4], old_runstate_time); + /* return an consistent snapshot of 64-bit time/counter value */ static u64 get64(const u64 *p) { @@ -47,8 +50,8 @@ static u64 get64(const u64 *p) return ret; } -static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res, - unsigned int cpu) +static void xen_get_runstate_snapshot_cpu_delta( + struct vcpu_runstate_info *res, unsigned int cpu) { u64 state_time; struct vcpu_runstate_info *state; @@ -66,6 +69,71 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res, (state_time & XEN_RUNSTATE_UPDATE)); } +static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res, + unsigned int cpu) +{ + int i; + + xen_get_runstate_snapshot_cpu_delta(res, cpu); + + for (i = 0; i < 4; i++) + res->time[i] += per_cpu(old_runstate_time, cpu)[i]; +} + +void xen_manage_runstate_time(int action) +{ + static struct vcpu_runstate_info *runstate_delta; + struct vcpu_runstate_info state; + int cpu, i; + + switch (action) { + case -1: /* backup runstate time before suspend */ + if (unlikely(runstate_delta)) + pr_warn_once("%s: memory leak as runstate_delta is not NULL\n", + __func__); + + runstate_delta = kmalloc_array(num_possible_cpus(), + sizeof(*runstate_delta), + GFP_ATOMIC); + if (unlikely(!runstate_delta)) { + pr_warn("%s: failed to allocate runstate_delta\n", + __func__); + return; + } + + for_each_possible_cpu(cpu) { + xen_get_runstate_snapshot_cpu_delta(&state, cpu); + memcpy(runstate_delta[cpu].time, state.time, + sizeof(runstate_delta[cpu].time)); + } + + break; + + case 0: /* backup runstate time after resume */ + if (unlikely(!runstate_delta)) { + pr_warn("%s: cannot accumulate runstate time as runstate_delta is NULL\n", + __func__); + return; + } + + 
for_each_possible_cpu(cpu) { + for (i = 0; i < 4; i++) + per_cpu(old_runstate_time, cpu)[i] += + runstate_delta[cpu].time[i]; + } + + break; + + default: /* do not accumulate runstate time for checkpointing */ + break; + } + + if (action != -1 && runstate_delta) { + kfree(runstate_delta); + runstate_delta = NULL; + } +} + /* * Runstate accounting */ diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 218e6aae5433..09072271f122 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -32,6 +32,7 @@ void xen_resume_notifier_unregister(struct notifier_block *nb); bool xen_vcpu_stolen(int vcpu); void xen_setup_runstate_info(int cpu); void xen_time_setup_guest(void); +void xen_manage_runstate_time(int action); void xen_get_runstate_snapshot(struct vcpu_runstate_info *res); u64 xen_steal_clock(int cpu); From 5fa916f7ace1d4dd246ac3b5ebbe2800f7e1ac04 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 2 Nov 2017 13:41:07 -0500 Subject: [PATCH 16/34] xen: xenbus_probe_frontend: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Addresses-Coverity-ID: 146562 Addresses-Coverity-ID: 146563 Signed-off-by: Gustavo A. R. Silva Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/xenbus/xenbus_probe_frontend.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 19e45ce21f89..07896f4b2736 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -379,10 +379,12 @@ static void xenbus_reset_frontend(char *fe, char *be, int be_state) case XenbusStateConnected: xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing); xenbus_reset_wait_for_backend(be, XenbusStateClosing); + /* fall through */ case XenbusStateClosing: xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed); xenbus_reset_wait_for_backend(be, XenbusStateClosed); + /* fall through */ case XenbusStateClosed: xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising); From 3d8765d4f52a0050343bfa99a1a2aa5ef7d841e5 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 2 Nov 2017 13:51:22 -0500 Subject: [PATCH 17/34] xen/pvcalls-front: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that in this particular case I placed the "fall through" comment on its own line, which is what GCC is expecting to find. Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 0c1ec6894cc4..ed94ea0b4b8e 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -1250,7 +1250,8 @@ static void pvcalls_front_changed(struct xenbus_device *dev, case XenbusStateClosed: if (dev->state == XenbusStateClosed) break; - /* Missed the backend's CLOSING state -- fallthrough */ + /* Missed the backend's CLOSING state */ + /* fall through */ case XenbusStateClosing: xenbus_frontend_closed(dev); break; From b5494ad83fb52a8e5a7dc1d30cb42cbca5d617f1 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 2 Nov 2017 18:18:03 -0400 Subject: [PATCH 18/34] xen/time: Return -ENODEV from xen_get_wallclock() For any other error sync_cmos_clock() will reschedule itself every second or so, for no good reason. Suggested-by: Paolo Bonzini Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/xen/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 1ecb05db3632..244791ff8201 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -74,7 +74,7 @@ static void xen_get_wallclock(struct timespec *now) static int xen_set_wallclock(const struct timespec *now) { - return -1; + return -ENODEV; } static int xen_pvclock_gtod_notify(struct notifier_block *nb, From 95110ac88d5139c73eef6ede37eff23c4089f4f2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 3 Nov 2017 08:42:02 +0000 Subject: [PATCH 19/34] xen/pvcalls: fix unsigned less than zero error check The check on bedata->ref is never true because ref is an unsigned integer. Fix this by assigning signed int ret to the return of the call to gnttab_claim_grant_reference so the -ve return can be checked. Detected by CoverityScan, CID#1460358 ("Unsigned compared against 0") Fixes: 219681909913 ("xen/pvcalls: connect to the backend") Signed-off-by: Colin Ian King Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index ed94ea0b4b8e..60a0ad81d4cf 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -1186,11 +1186,10 @@ static int pvcalls_front_probe(struct xenbus_device *dev, ret = gnttab_alloc_grant_references(1, &gref_head); if (ret < 0) goto error; - bedata->ref = gnttab_claim_grant_reference(&gref_head); - if (bedata->ref < 0) { - ret = bedata->ref; + ret = gnttab_claim_grant_reference(&gref_head); + if (ret < 0) goto error; - } + bedata->ref = ret; gnttab_grant_foreign_access_ref(bedata->ref, dev->otherend_id, virt_to_gfn((void *)sring), 0); From 773aaadcd474d0a4f85915787118891d47b60983 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 3 Nov 2017 09:20:47 +0000 Subject: [PATCH 20/34] xen/pvcalls: remove redundant check for irq >= 0 This is a moot point, but irq is always less than zero at the label out_error, so the check for irq >= 0 is redundant and can be removed. 
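The unsigned-compare class of bug fixed in the previous patch is worth a self-contained illustration; here claim() is a hypothetical stand-in for gnttab_claim_grant_reference(), not kernel code:

#include <stdio.h>

static int claim(void) { return -12; }  /* a negative errno, say -ENOMEM */

int main(void)
{
	unsigned int ref = claim();     /* bug: sign is lost on assignment */
	if (ref < 0)                    /* always false -- dead error path */
		puts("never printed");
	(void)ref;

	int ret = claim();              /* fix: test the signed return value... */
	if (ret < 0)
		return 1;               /* ...so the error actually propagates */
	ref = ret;                      /* store into the unsigned type only
	                                 * after the sign check */
	return 0;
}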
Detected by CoverityScan, CID#1460371 ("Logically dead code") Fixes: cb1c7d9bbc87 ("xen/pvcalls: implement connect command") Signed-off-by: Colin Ian King Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 60a0ad81d4cf..2925b2f095ed 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -351,9 +351,7 @@ static int create_active(struct sock_mapping *map, int *evtchn) return 0; out_error: - if (irq >= 0) - unbind_from_irqhandler(irq, map); - else if (*evtchn >= 0) + if (*evtchn >= 0) xenbus_free_evtchn(pvcalls_front_dev, *evtchn); kfree(map->active.data.in); kfree(map->active.ring); From ec4001c3f29ebb3d4147aaec7be9c687ddadb7c8 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Fri, 3 Nov 2017 17:04:11 +0000 Subject: [PATCH 21/34] xen: support priv-mapping in an HVM tools domain If the domain has XENFEAT_auto_translated_physmap then use of the PV-specific HYPERVISOR_mmu_update hypercall is clearly incorrect. This patch adds checks in xen_remap_domain_gfn_array() and xen_unmap_domain_gfn_array() which call through to the appropriate xlate_mmu function if the feature is present. A check is also added to xen_remap_domain_gfn_range() to fail with -EOPNOTSUPP since this should not be used in an HVM tools domain. Signed-off-by: Paul Durrant Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- arch/x86/xen/mmu.c | 14 ++++++++++++-- include/xen/xen-ops.h | 24 ++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3e15345abfe7..d33e7dbe3129 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -172,6 +172,9 @@ int xen_remap_domain_gfn_range(struct vm_area_struct *vma, pgprot_t prot, unsigned domid, struct page **pages) { + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -EOPNOTSUPP; + return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages); } EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range); @@ -182,6 +185,10 @@ int xen_remap_domain_gfn_array(struct vm_area_struct *vma, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages) { + if (xen_feature(XENFEAT_auto_translated_physmap)) + return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, + prot, domid, pages); + /* We BUG_ON because it's a programmer error to pass a NULL err_ptr, * and the consequences later is quite hard to detect what the actual * cause of "wrong memory was mapped in".
@@ -193,9 +200,12 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array); /* Returns: 0 success */ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, - int numpgs, struct page **pages) + int nr, struct page **pages) { - if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_feature(XENFEAT_auto_translated_physmap)) + return xen_xlate_unmap_gfn_range(vma, nr, pages); + + if (!pages) return 0; return -EINVAL; diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 09072271f122..c278b21cad39 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -104,6 +104,8 @@ int xen_remap_domain_gfn_range(struct vm_area_struct *vma, struct page **pages); int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int numpgs, struct page **pages); + +#ifdef CONFIG_XEN_AUTO_XLATE int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t *gfn, int nr, @@ -112,6 +114,28 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, struct page **pages); int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma, int nr, struct page **pages); +#else +/* + * These two functions are called from arch/x86/xen/mmu.c and so stubs + * are needed for a configuration not specifying CONFIG_XEN_AUTO_XLATE. + */ +static inline int xen_xlate_remap_gfn_array(struct vm_area_struct *vma, + unsigned long addr, + xen_pfn_t *gfn, int nr, + int *err_ptr, pgprot_t prot, + unsigned int domid, + struct page **pages) +{ + return -EOPNOTSUPP; +} + +static inline int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma, + int nr, struct page **pages) +{ + return -EOPNOTSUPP; +} +#endif + int xen_xlate_map_ballooned_pages(xen_pfn_t **pfns, void **vaddr, unsigned long nr_grant_frames); From b988b8ff072ab04abd62d10d3fe44ec544be8a7d Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 2 Nov 2017 10:19:17 +0100 Subject: [PATCH 22/34] xen: re-introduce support for grant v2 interface The grant v2 support was removed from the kernel with commit 438b33c7145ca8a5131a30c36d8f59bce119a19a ("xen/grant-table: remove support for V2 tables") as the higher memory footprint of v2 grants resulted in fewer grants being possible for a kernel compared to the v1 grant interface. As machines with more than 16TB of memory are expected to be more common in the near future, support of grant v2 is mandatory in order to be able to run a Xen pv domain at any memory location. So re-add grant v2 support basically by reverting the above commit.
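For context, a sketch of the two entry layouts behind that memory-footprint tradeoff, paraphrased from Xen's public grant_table.h (not part of the patch): a v1 entry packs a 32-bit frame number, which with 4 KiB pages caps addressable memory at 2^44 bytes = 16 TiB, while a v2 full-page entry widens the frame to 64 bits, doubles the entry size (halving grants per frame), and moves the status bits to the separate status frames mapped by this patch:

#include <stdint.h>

typedef uint16_t domid_t;

struct grant_entry_v1 {                 /* 8 bytes */
	uint16_t flags;
	domid_t  domid;
	uint32_t frame;                 /* <= 2^32 frames = 16 TiB @ 4 KiB */
};

struct grant_entry_header {             /* shared v2 header */
	uint16_t flags;
	domid_t  domid;
};

union grant_entry_v2 {                  /* 16 bytes */
	struct grant_entry_header hdr;
	struct {
		struct grant_entry_header hdr;
		uint32_t pad0;
		uint64_t frame;         /* full-width machine frame number */
	} full_page;
	/* sub_page and transitive variants elided */
};

int main(void)                          /* sanity-check the sizes */
{
	return (sizeof(struct grant_entry_v1) == 8 &&
		sizeof(union grant_entry_v2) == 16) ? 0 : 1;
}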
Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- arch/arm/xen/grant-table.c | 9 +- arch/x86/xen/grant-table.c | 62 +++++++- drivers/xen/grant-table.c | 312 ++++++++++++++++++++++++++++++++++++- include/xen/grant_table.h | 30 +++- 4 files changed, 399 insertions(+), 14 deletions(-) diff --git a/arch/arm/xen/grant-table.c b/arch/arm/xen/grant-table.c index e43791829ace..91cf08ba1e95 100644 --- a/arch/arm/xen/grant-table.c +++ b/arch/arm/xen/grant-table.c @@ -45,7 +45,14 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) return; } -int arch_gnttab_init(unsigned long nr_shared) +int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + grant_status_t **__shared) +{ + return -ENOSYS; +} + +int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status) { return 0; } diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 809b6c812654..92ccc718152d 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -49,7 +49,7 @@ static struct gnttab_vm_area { struct vm_struct *area; pte_t **ptes; -} gnttab_shared_vm_area; +} gnttab_shared_vm_area, gnttab_status_vm_area; int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, @@ -73,16 +73,43 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, return 0; } -void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) +int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + grant_status_t **__shared) { + grant_status_t *shared = *__shared; unsigned long addr; unsigned long i; + if (shared == NULL) + *__shared = shared = gnttab_status_vm_area.area->addr; + addr = (unsigned long)shared; for (i = 0; i < nr_gframes; i++) { - set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i], - __pte(0)); + set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i], + mfn_pte(frames[i], PAGE_KERNEL)); + addr += PAGE_SIZE; + } + + return 0; +} + +void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) +{ + pte_t **ptes; + unsigned long addr; + unsigned long i; + + if (shared == gnttab_status_vm_area.area->addr) + ptes = gnttab_status_vm_area.ptes; + else + ptes = gnttab_shared_vm_area.ptes; + + addr = (unsigned long)shared; + + for (i = 0; i < nr_gframes; i++) { + set_pte_at(&init_mm, addr, ptes[i], __pte(0)); addr += PAGE_SIZE; } } @@ -102,12 +129,35 @@ static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames) return 0; } -int arch_gnttab_init(unsigned long nr_shared) +static void arch_gnttab_vfree(struct gnttab_vm_area *area) { + free_vm_area(area->area); + kfree(area->ptes); +} + +int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status) +{ + int ret; + if (!xen_pv_domain()) return 0; - return arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared); + ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared); + if (ret < 0) + return ret; + + /* + * Always allocate the space for the status frames in case + * we're migrated to a host with V2 support. 
+ */ + ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status); + if (ret < 0) + goto err; + + return 0; +err: + arch_gnttab_vfree(&gnttab_shared_vm_area); + return -ENOMEM; } #ifdef CONFIG_XEN_PVH diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 2c6a9114d332..65c4bdb0b463 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -71,6 +71,7 @@ struct grant_frames xen_auto_xlat_grant_frames; static union { struct grant_entry_v1 *v1; + union grant_entry_v2 *v2; void *addr; } gnttab_shared; @@ -121,6 +122,29 @@ struct gnttab_ops { * by bit operations. */ int (*query_foreign_access)(grant_ref_t ref); + /* + * Grant a domain to access a range of bytes within the page referred by + * an available grant entry. Ref parameter is reference of a grant entry + * which will be sub-page accessed, domid is id of grantee domain, frame + * is frame address of subpage grant, flags is grant type and flag + * information, page_off is offset of the range of bytes, and length is + * length of bytes to be accessed. + */ + void (*update_subpage_entry)(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags, + unsigned page_off, unsigned length); + /* + * Redirect an available grant entry on domain A to another grant + * reference of domain B, then allow domain C to use grant reference + * of domain B transitively. Ref parameter is an available grant entry + * reference on domain A, domid is id of domain C which accesses grant + * entry transitively, flags is grant type and flag information, + * trans_domid is id of domain B whose grant entry is finally accessed + * transitively, trans_gref is grant entry transitive reference of + * domain B. + */ + void (*update_trans_entry)(grant_ref_t ref, domid_t domid, int flags, + domid_t trans_domid, grant_ref_t trans_gref); }; struct unmap_refs_callback_data { @@ -130,6 +154,9 @@ struct unmap_refs_callback_data { static const struct gnttab_ops *gnttab_interface; +/* This reflects status of grant entries, so act as a global value. */ +static grant_status_t *grstatus; + static int grant_table_version; static int grefs_per_grant_frame; @@ -138,6 +165,7 @@ static struct gnttab_free_callback *gnttab_free_callback_list; static int gnttab_expand(unsigned int req_entries); #define RPP (PAGE_SIZE / sizeof(grant_ref_t)) +#define SPP (PAGE_SIZE / sizeof(grant_status_t)) static inline grant_ref_t *__gnttab_entry(grant_ref_t entry) { @@ -210,7 +238,7 @@ static void put_free_entry(grant_ref_t ref) } /* - * Following applies to gnttab_update_entry_v1. + * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2. * Introducing a valid entry into the grant table: * 1. Write ent->domid. * 2. Write ent->frame: @@ -229,6 +257,15 @@ static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid, gnttab_shared.v1[ref].flags = flags; } +static void gnttab_update_entry_v2(grant_ref_t ref, domid_t domid, + unsigned long frame, unsigned int flags) +{ + gnttab_shared.v2[ref].hdr.domid = domid; + gnttab_shared.v2[ref].full_page.frame = frame; + wmb(); /* Hypervisor concurrent accesses. 
*/ + gnttab_shared.v2[ref].hdr.flags = GTF_permit_access | flags; +} + /* * Public grant-issuing interface functions */ @@ -255,11 +292,132 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); +static void gnttab_update_subpage_entry_v2(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags, + unsigned page_off, unsigned length) +{ + gnttab_shared.v2[ref].sub_page.frame = frame; + gnttab_shared.v2[ref].sub_page.page_off = page_off; + gnttab_shared.v2[ref].sub_page.length = length; + gnttab_shared.v2[ref].hdr.domid = domid; + wmb(); + gnttab_shared.v2[ref].hdr.flags = + GTF_permit_access | GTF_sub_page | flags; +} + +int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags, + unsigned page_off, + unsigned length) +{ + if (flags & (GTF_accept_transfer | GTF_reading | + GTF_writing | GTF_transitive)) + return -EPERM; + + if (gnttab_interface->update_subpage_entry == NULL) + return -ENOSYS; + + gnttab_interface->update_subpage_entry(ref, domid, frame, flags, + page_off, length); + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage_ref); + +int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame, + int flags, unsigned page_off, + unsigned length) +{ + int ref, rc; + + ref = get_free_entries(1); + if (unlikely(ref < 0)) + return -ENOSPC; + + rc = gnttab_grant_foreign_access_subpage_ref(ref, domid, frame, flags, + page_off, length); + if (rc < 0) { + put_free_entry(ref); + return rc; + } + + return ref; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage); + +bool gnttab_subpage_grants_available(void) +{ + return gnttab_interface->update_subpage_entry != NULL; +} +EXPORT_SYMBOL_GPL(gnttab_subpage_grants_available); + +static void gnttab_update_trans_entry_v2(grant_ref_t ref, domid_t domid, + int flags, domid_t trans_domid, + grant_ref_t trans_gref) +{ + gnttab_shared.v2[ref].transitive.trans_domid = trans_domid; + gnttab_shared.v2[ref].transitive.gref = trans_gref; + gnttab_shared.v2[ref].hdr.domid = domid; + wmb(); + gnttab_shared.v2[ref].hdr.flags = + GTF_permit_access | GTF_transitive | flags; +} + +int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid, + int flags, domid_t trans_domid, + grant_ref_t trans_gref) +{ + if (flags & (GTF_accept_transfer | GTF_reading | + GTF_writing | GTF_sub_page)) + return -EPERM; + + if (gnttab_interface->update_trans_entry == NULL) + return -ENOSYS; + + gnttab_interface->update_trans_entry(ref, domid, flags, trans_domid, + trans_gref); + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans_ref); + +int gnttab_grant_foreign_access_trans(domid_t domid, int flags, + domid_t trans_domid, + grant_ref_t trans_gref) +{ + int ref, rc; + + ref = get_free_entries(1); + if (unlikely(ref < 0)) + return -ENOSPC; + + rc = gnttab_grant_foreign_access_trans_ref(ref, domid, flags, + trans_domid, trans_gref); + if (rc < 0) { + put_free_entry(ref); + return rc; + } + + return ref; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans); + +bool gnttab_trans_grants_available(void) +{ + return gnttab_interface->update_trans_entry != NULL; +} +EXPORT_SYMBOL_GPL(gnttab_trans_grants_available); + static int gnttab_query_foreign_access_v1(grant_ref_t ref) { return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); } +static int gnttab_query_foreign_access_v2(grant_ref_t ref) +{ + return grstatus[ref] & (GTF_reading|GTF_writing); +} + int 
gnttab_query_foreign_access(grant_ref_t ref) { return gnttab_interface->query_foreign_access(ref); @@ -282,6 +440,29 @@ static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) return 1; } +static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly) +{ + gnttab_shared.v2[ref].hdr.flags = 0; + mb(); /* Concurrent access by hypervisor. */ + if (grstatus[ref] & (GTF_reading|GTF_writing)) { + return 0; + } else { + /* + * The read of grstatus needs to have acquire semantics. + * On x86, reads already have that, and we just need to + * protect against compiler reorderings. + * On other architectures we may need a full barrier. + */ +#ifdef CONFIG_X86 + barrier(); +#else + mb(); +#endif + } + + return 1; +} + static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) { return gnttab_interface->end_foreign_access_ref(ref, readonly); @@ -442,6 +623,37 @@ static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref) return frame; } +static unsigned long gnttab_end_foreign_transfer_ref_v2(grant_ref_t ref) +{ + unsigned long frame; + u16 flags; + u16 *pflags; + + pflags = &gnttab_shared.v2[ref].hdr.flags; + + /* + * If a transfer is not even yet started, try to reclaim the grant + * reference and return failure (== 0). + */ + while (!((flags = *pflags) & GTF_transfer_committed)) { + if (sync_cmpxchg(pflags, flags, 0) == flags) + return 0; + cpu_relax(); + } + + /* If a transfer is in progress then wait until it is completed. */ + while (!(flags & GTF_transfer_completed)) { + flags = *pflags; + cpu_relax(); + } + + rmb(); /* Read the frame number /after/ reading completion status. */ + frame = gnttab_shared.v2[ref].full_page.frame; + BUG_ON(frame == 0); + + return frame; +} + unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) { return gnttab_interface->end_foreign_transfer_ref(ref); @@ -938,6 +1150,12 @@ int gnttab_unmap_refs_sync(struct gntab_unmap_queue_data *item) } EXPORT_SYMBOL_GPL(gnttab_unmap_refs_sync); +static unsigned int nr_status_frames(unsigned int nr_grant_frames) +{ + BUG_ON(grefs_per_grant_frame == 0); + return (nr_grant_frames * grefs_per_grant_frame + SPP - 1) / SPP; +} + static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes) { int rc; @@ -955,6 +1173,55 @@ static void gnttab_unmap_frames_v1(void) arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames); } +static int gnttab_map_frames_v2(xen_pfn_t *frames, unsigned int nr_gframes) +{ + uint64_t *sframes; + unsigned int nr_sframes; + struct gnttab_get_status_frames getframes; + int rc; + + nr_sframes = nr_status_frames(nr_gframes); + + /* No need for kzalloc as it is initialized in following hypercall + * GNTTABOP_get_status_frames. 
+ */ + sframes = kmalloc_array(nr_sframes, sizeof(uint64_t), GFP_ATOMIC); + if (!sframes) + return -ENOMEM; + + getframes.dom = DOMID_SELF; + getframes.nr_frames = nr_sframes; + set_xen_guest_handle(getframes.frame_list, sframes); + + rc = HYPERVISOR_grant_table_op(GNTTABOP_get_status_frames, + &getframes, 1); + if (rc == -ENOSYS) { + kfree(sframes); + return -ENOSYS; + } + + BUG_ON(rc || getframes.status); + + rc = arch_gnttab_map_status(sframes, nr_sframes, + nr_status_frames(gnttab_max_grant_frames()), + &grstatus); + BUG_ON(rc); + kfree(sframes); + + rc = arch_gnttab_map_shared(frames, nr_gframes, + gnttab_max_grant_frames(), + &gnttab_shared.addr); + BUG_ON(rc); + + return 0; +} + +static void gnttab_unmap_frames_v2(void) +{ + arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames); + arch_gnttab_unmap(grstatus, nr_status_frames(nr_grant_frames)); +} + static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { struct gnttab_setup_table setup; @@ -1022,13 +1289,45 @@ static const struct gnttab_ops gnttab_v1_ops = { .query_foreign_access = gnttab_query_foreign_access_v1, }; +static const struct gnttab_ops gnttab_v2_ops = { + .map_frames = gnttab_map_frames_v2, + .unmap_frames = gnttab_unmap_frames_v2, + .update_entry = gnttab_update_entry_v2, + .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, + .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, + .query_foreign_access = gnttab_query_foreign_access_v2, + .update_subpage_entry = gnttab_update_subpage_entry_v2, + .update_trans_entry = gnttab_update_trans_entry_v2, +}; + static void gnttab_request_version(void) { - /* Only version 1 is used, which will always be available. */ - grant_table_version = 1; - grefs_per_grant_frame = XEN_PAGE_SIZE / sizeof(struct grant_entry_v1); - gnttab_interface = &gnttab_v1_ops; + int rc; + struct gnttab_set_version gsv; + gsv.version = 1; + + rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); + if (rc == 0 && gsv.version == 2) { + grant_table_version = 2; + grefs_per_grant_frame = XEN_PAGE_SIZE / + sizeof(union grant_entry_v2); + gnttab_interface = &gnttab_v2_ops; + } else if (grant_table_version == 2) { + /* + * If we've already used version 2 features, + * but then suddenly discover that they're not + * available (e.g. migrating to an older + * version of Xen), almost unbounded badness + * can happen. + */ + panic("we need grant tables version 2, but only version 1 is available"); + } else { + grant_table_version = 1; + grefs_per_grant_frame = XEN_PAGE_SIZE / + sizeof(struct grant_entry_v1); + gnttab_interface = &gnttab_v1_ops; + } pr_info("Grant tables using version %d layout\n", grant_table_version); } @@ -1122,7 +1421,8 @@ int gnttab_init(void) } } - ret = arch_gnttab_init(max_nr_grant_frames); + ret = arch_gnttab_init(max_nr_grant_frames, + nr_status_frames(max_nr_grant_frames)); if (ret < 0) goto ini_nomem; diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 34b1379f9777..dd6c7a32ee32 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -84,6 +84,24 @@ int gnttab_resume(void); int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int readonly); +int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame, + int flags, unsigned page_off, + unsigned length); +int gnttab_grant_foreign_access_trans(domid_t domid, int flags, + domid_t trans_domid, + grant_ref_t trans_gref); + +/* + * Are sub-page grants available on this version of Xen? Returns true if they + * are, and false if they're not. 
+ */ +bool gnttab_subpage_grants_available(void); + +/* + * Are transitive grants available on this version of Xen? Returns true if they + * are, and false if they're not. + */ +bool gnttab_trans_grants_available(void); /* * End access through the given grant reference, iff the grant entry is no @@ -130,6 +148,13 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback); void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, unsigned long frame, int readonly); +int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags, + unsigned page_off, + unsigned length); +int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid, + int flags, domid_t trans_domid, + grant_ref_t trans_gref); /* Give access to the first 4K of the page */ static inline void gnttab_page_grant_foreign_access_ref_one( @@ -174,10 +199,13 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr, unmap->dev_bus_addr = 0; } -int arch_gnttab_init(unsigned long nr_shared); +int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status); int arch_gnttab_map_shared(xen_pfn_t *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, void **__shared); +int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + grant_status_t **__shared); void arch_gnttab_unmap(void *shared, unsigned long nr_gframes); struct grant_frames { From 56c9c700c4399858e50971d98ac44b5842b06a87 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 2 Nov 2017 10:19:18 +0100 Subject: [PATCH 23/34] xen: limit grant v2 interface to the v1 functionality As there is currently no user of sub-page grants or transitive grants, remove that functionality. This at once makes it possible to switch from grant v2 to grant v1 without restrictions, as there is no loss of functionality other than the limited frame number width related to the switch. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/grant-table.c | 150 -------------------------------------- include/xen/grant_table.h | 25 ------- 2 files changed, 175 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 65c4bdb0b463..db6a1b9efd11 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -122,29 +122,6 @@ struct gnttab_ops { * by bit operations. */ int (*query_foreign_access)(grant_ref_t ref); - /* - * Grant a domain to access a range of bytes within the page referred by - * an available grant entry. Ref parameter is reference of a grant entry - * which will be sub-page accessed, domid is id of grantee domain, frame - * is frame address of subpage grant, flags is grant type and flag - * information, page_off is offset of the range of bytes, and length is - * length of bytes to be accessed. - */ - void (*update_subpage_entry)(grant_ref_t ref, domid_t domid, - unsigned long frame, int flags, - unsigned page_off, unsigned length); - /* - * Redirect an available grant entry on domain A to another grant - * reference of domain B, then allow domain C to use grant reference - * of domain B transitively. Ref parameter is an available grant entry - * reference on domain A, domid is id of domain C which accesses grant - * entry transitively, flags is grant type and flag information, - * trans_domid is id of domain B whose grant entry is finally accessed - * transitively, trans_gref is grant entry transitive reference of - * domain B.
- */ - void (*update_trans_entry)(grant_ref_t ref, domid_t domid, int flags, - domid_t trans_domid, grant_ref_t trans_gref); }; struct unmap_refs_callback_data { @@ -292,122 +269,6 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); -static void gnttab_update_subpage_entry_v2(grant_ref_t ref, domid_t domid, - unsigned long frame, int flags, - unsigned page_off, unsigned length) -{ - gnttab_shared.v2[ref].sub_page.frame = frame; - gnttab_shared.v2[ref].sub_page.page_off = page_off; - gnttab_shared.v2[ref].sub_page.length = length; - gnttab_shared.v2[ref].hdr.domid = domid; - wmb(); - gnttab_shared.v2[ref].hdr.flags = - GTF_permit_access | GTF_sub_page | flags; -} - -int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid, - unsigned long frame, int flags, - unsigned page_off, - unsigned length) -{ - if (flags & (GTF_accept_transfer | GTF_reading | - GTF_writing | GTF_transitive)) - return -EPERM; - - if (gnttab_interface->update_subpage_entry == NULL) - return -ENOSYS; - - gnttab_interface->update_subpage_entry(ref, domid, frame, flags, - page_off, length); - - return 0; -} -EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage_ref); - -int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame, - int flags, unsigned page_off, - unsigned length) -{ - int ref, rc; - - ref = get_free_entries(1); - if (unlikely(ref < 0)) - return -ENOSPC; - - rc = gnttab_grant_foreign_access_subpage_ref(ref, domid, frame, flags, - page_off, length); - if (rc < 0) { - put_free_entry(ref); - return rc; - } - - return ref; -} -EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_subpage); - -bool gnttab_subpage_grants_available(void) -{ - return gnttab_interface->update_subpage_entry != NULL; -} -EXPORT_SYMBOL_GPL(gnttab_subpage_grants_available); - -static void gnttab_update_trans_entry_v2(grant_ref_t ref, domid_t domid, - int flags, domid_t trans_domid, - grant_ref_t trans_gref) -{ - gnttab_shared.v2[ref].transitive.trans_domid = trans_domid; - gnttab_shared.v2[ref].transitive.gref = trans_gref; - gnttab_shared.v2[ref].hdr.domid = domid; - wmb(); - gnttab_shared.v2[ref].hdr.flags = - GTF_permit_access | GTF_transitive | flags; -} - -int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid, - int flags, domid_t trans_domid, - grant_ref_t trans_gref) -{ - if (flags & (GTF_accept_transfer | GTF_reading | - GTF_writing | GTF_sub_page)) - return -EPERM; - - if (gnttab_interface->update_trans_entry == NULL) - return -ENOSYS; - - gnttab_interface->update_trans_entry(ref, domid, flags, trans_domid, - trans_gref); - - return 0; -} -EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans_ref); - -int gnttab_grant_foreign_access_trans(domid_t domid, int flags, - domid_t trans_domid, - grant_ref_t trans_gref) -{ - int ref, rc; - - ref = get_free_entries(1); - if (unlikely(ref < 0)) - return -ENOSPC; - - rc = gnttab_grant_foreign_access_trans_ref(ref, domid, flags, - trans_domid, trans_gref); - if (rc < 0) { - put_free_entry(ref); - return rc; - } - - return ref; -} -EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_trans); - -bool gnttab_trans_grants_available(void) -{ - return gnttab_interface->update_trans_entry != NULL; -} -EXPORT_SYMBOL_GPL(gnttab_trans_grants_available); - static int gnttab_query_foreign_access_v1(grant_ref_t ref) { return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); @@ -1296,8 +1157,6 @@ static const struct gnttab_ops gnttab_v2_ops = { .end_foreign_access_ref = 
gnttab_end_foreign_access_ref_v2, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, .query_foreign_access = gnttab_query_foreign_access_v2, - .update_subpage_entry = gnttab_update_subpage_entry_v2, - .update_trans_entry = gnttab_update_trans_entry_v2, }; static void gnttab_request_version(void) @@ -1313,15 +1172,6 @@ static void gnttab_request_version(void) grefs_per_grant_frame = XEN_PAGE_SIZE / sizeof(union grant_entry_v2); gnttab_interface = &gnttab_v2_ops; - } else if (grant_table_version == 2) { - /* - * If we've already used version 2 features, - * but then suddenly discover that they're not - * available (e.g. migrating to an older - * version of Xen), almost unbounded badness - * can happen. - */ - panic("we need grant tables version 2, but only version 1 is available"); } else { grant_table_version = 1; grefs_per_grant_frame = XEN_PAGE_SIZE / diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index dd6c7a32ee32..2e37741f6b8d 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -84,24 +84,6 @@ int gnttab_resume(void); int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int readonly); -int gnttab_grant_foreign_access_subpage(domid_t domid, unsigned long frame, - int flags, unsigned page_off, - unsigned length); -int gnttab_grant_foreign_access_trans(domid_t domid, int flags, - domid_t trans_domid, - grant_ref_t trans_gref); - -/* - * Are sub-page grants available on this version of Xen? Returns true if they - * are, and false if they're not. - */ -bool gnttab_subpage_grants_available(void); - -/* - * Are transitive grants available on this version of Xen? Returns true if they - * are, and false if they're not. - */ -bool gnttab_trans_grants_available(void); /* * End access through the given grant reference, iff the grant entry is no @@ -148,13 +130,6 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback); void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, unsigned long frame, int readonly); -int gnttab_grant_foreign_access_subpage_ref(grant_ref_t ref, domid_t domid, - unsigned long frame, int flags, - unsigned page_off, - unsigned length); -int gnttab_grant_foreign_access_trans_ref(grant_ref_t ref, domid_t domid, - int flags, domid_t trans_domid, - grant_ref_t trans_gref); /* Give access to the first 4K of the page */ static inline void gnttab_page_grant_foreign_access_ref_one( From 83c69324f43816f8259d1a361062a974f4bdb86d Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 2 Nov 2017 10:19:19 +0100 Subject: [PATCH 24/34] xen: add grant interface version dependent constants to gnttab_ops Instead of having multiple variables with constants like grant_table_version or grefs_per_grant_frame add those to struct gnttab_ops and access them just via the gnttab_interface pointer. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/grant-table.c | 73 +++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index db6a1b9efd11..573e7fe9b147 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -77,6 +77,14 @@ static union { /*This is a structure of function pointers for grant table*/ struct gnttab_ops { + /* + * Version of the grant interface. + */ + unsigned int version; + /* + * Grant refs per grant frame. + */ + unsigned int grefs_per_grant_frame; /* * Mapping a list of frames for storing grant entries. 
Frames parameter * is used to store grant table address when grant table being setup, @@ -134,9 +142,6 @@ static const struct gnttab_ops *gnttab_interface; /* This reflects status of grant entries, so act as a global value. */ static grant_status_t *grstatus; -static int grant_table_version; -static int grefs_per_grant_frame; - static struct gnttab_free_callback *gnttab_free_callback_list; static int gnttab_expand(unsigned int req_entries); @@ -636,19 +641,26 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback) } EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback); +static unsigned int gnttab_frames(unsigned int frames, unsigned int align) +{ + return (frames * gnttab_interface->grefs_per_grant_frame + align - 1) / + align; +} + static int grow_gnttab_list(unsigned int more_frames) { unsigned int new_nr_grant_frames, extra_entries, i; unsigned int nr_glist_frames, new_nr_glist_frames; + unsigned int grefs_per_frame; - BUG_ON(grefs_per_grant_frame == 0); + BUG_ON(gnttab_interface == NULL); + grefs_per_frame = gnttab_interface->grefs_per_grant_frame; new_nr_grant_frames = nr_grant_frames + more_frames; - extra_entries = more_frames * grefs_per_grant_frame; + extra_entries = more_frames * grefs_per_frame; - nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; - new_nr_glist_frames = - (new_nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; + nr_glist_frames = gnttab_frames(nr_grant_frames, RPP); + new_nr_glist_frames = gnttab_frames(new_nr_grant_frames, RPP); for (i = nr_glist_frames; i < new_nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC); if (!gnttab_list[i]) @@ -656,12 +668,12 @@ static int grow_gnttab_list(unsigned int more_frames) } - for (i = grefs_per_grant_frame * nr_grant_frames; - i < grefs_per_grant_frame * new_nr_grant_frames - 1; i++) + for (i = grefs_per_frame * nr_grant_frames; + i < grefs_per_frame * new_nr_grant_frames - 1; i++) gnttab_entry(i) = i + 1; gnttab_entry(i) = gnttab_free_head; - gnttab_free_head = grefs_per_grant_frame * nr_grant_frames; + gnttab_free_head = grefs_per_frame * nr_grant_frames; gnttab_free_count += extra_entries; nr_grant_frames = new_nr_grant_frames; @@ -1013,8 +1025,8 @@ EXPORT_SYMBOL_GPL(gnttab_unmap_refs_sync); static unsigned int nr_status_frames(unsigned int nr_grant_frames) { - BUG_ON(grefs_per_grant_frame == 0); - return (nr_grant_frames * grefs_per_grant_frame + SPP - 1) / SPP; + BUG_ON(gnttab_interface == NULL); + return gnttab_frames(nr_grant_frames, SPP); } static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes) @@ -1142,6 +1154,9 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) } static const struct gnttab_ops gnttab_v1_ops = { + .version = 1, + .grefs_per_grant_frame = XEN_PAGE_SIZE / + sizeof(struct grant_entry_v1), .map_frames = gnttab_map_frames_v1, .unmap_frames = gnttab_unmap_frames_v1, .update_entry = gnttab_update_entry_v1, @@ -1151,6 +1166,9 @@ static const struct gnttab_ops gnttab_v1_ops = { }; static const struct gnttab_ops gnttab_v2_ops = { + .version = 2, + .grefs_per_grant_frame = XEN_PAGE_SIZE / + sizeof(union grant_entry_v2), .map_frames = gnttab_map_frames_v2, .unmap_frames = gnttab_unmap_frames_v2, .update_entry = gnttab_update_entry_v2, @@ -1167,18 +1185,12 @@ static void gnttab_request_version(void) gsv.version = 1; rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); - if (rc == 0 && gsv.version == 2) { - grant_table_version = 2; - grefs_per_grant_frame = XEN_PAGE_SIZE / - 
sizeof(union grant_entry_v2); + if (rc == 0 && gsv.version == 2) gnttab_interface = &gnttab_v2_ops; - } else { - grant_table_version = 1; - grefs_per_grant_frame = XEN_PAGE_SIZE / - sizeof(struct grant_entry_v1); + else gnttab_interface = &gnttab_v1_ops; - } - pr_info("Grant tables using version %d layout\n", grant_table_version); + pr_info("Grant tables using version %d layout\n", + gnttab_interface->version); } static int gnttab_setup(void) @@ -1218,10 +1230,10 @@ static int gnttab_expand(unsigned int req_entries) int rc; unsigned int cur, extra; - BUG_ON(grefs_per_grant_frame == 0); + BUG_ON(gnttab_interface == NULL); cur = nr_grant_frames; - extra = ((req_entries + (grefs_per_grant_frame-1)) / - grefs_per_grant_frame); + extra = ((req_entries + gnttab_interface->grefs_per_grant_frame - 1) / + gnttab_interface->grefs_per_grant_frame); if (cur + extra > gnttab_max_grant_frames()) { pr_warn_ratelimited("xen/grant-table: max_grant_frames reached" " cur=%u extra=%u limit=%u" @@ -1253,16 +1265,16 @@ int gnttab_init(void) /* Determine the maximum number of frames required for the * grant reference free list on the current hypervisor. */ - BUG_ON(grefs_per_grant_frame == 0); + BUG_ON(gnttab_interface == NULL); max_nr_glist_frames = (max_nr_grant_frames * - grefs_per_grant_frame / RPP); + gnttab_interface->grefs_per_grant_frame / RPP); gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), GFP_KERNEL); if (gnttab_list == NULL) return -ENOMEM; - nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; + nr_glist_frames = gnttab_frames(nr_grant_frames, RPP); for (i = 0; i < nr_glist_frames; i++) { gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL); if (gnttab_list[i] == NULL) { @@ -1281,7 +1293,8 @@ int gnttab_init(void) goto ini_nomem; } - nr_init_grefs = nr_grant_frames * grefs_per_grant_frame; + nr_init_grefs = nr_grant_frames * + gnttab_interface->grefs_per_grant_frame; for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) gnttab_entry(i) = i + 1; From 223c8f3349f9ce52847dfa266b32761ac223b9ac Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 2 Nov 2017 10:19:20 +0100 Subject: [PATCH 25/34] xen: update arch/x86/include/asm/xen/cpuid.h Update arch/x86/include/asm/xen/cpuid.h from the Xen tree to get newest definitions. 
Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- arch/x86/include/asm/xen/cpuid.h | 44 ++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h index 3bdd10d71223..a9630104f1c4 100644 --- a/arch/x86/include/asm/xen/cpuid.h +++ b/arch/x86/include/asm/xen/cpuid.h @@ -74,21 +74,43 @@ #define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0) /* - * Leaf 5 (0x40000x04) - * HVM-specific features - * EAX: Features - * EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag) + * Leaf 4 (0x40000x03) + * Sub-leaf 0: EAX: bit 0: emulated tsc + * bit 1: host tsc is known to be reliable + * bit 2: RDTSCP instruction available + * EBX: tsc_mode: 0=default (emulate if necessary), 1=emulate, + * 2=no emulation, 3=no emulation + TSC_AUX support + * ECX: guest tsc frequency in kHz + * EDX: guest tsc incarnation (migration count) + * Sub-leaf 1: EAX: tsc offset low part + * EBX: tsc offset high part + * ECX: multiplicator for tsc->ns conversion + * EDX: shift amount for tsc->ns conversion + * Sub-leaf 2: EAX: host tsc frequency in kHz */ -/* Virtualized APIC registers */ -#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) -/* Virtualized x2APIC accesses */ -#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) +/* + * Leaf 5 (0x40000x04) + * HVM-specific features + * Sub-leaf 0: EAX: Features + * Sub-leaf 0: EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag) + */ +#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */ +#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */ /* Memory mapped from other domains has valid IOMMU entries */ #define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2) -/* vcpu id is present in EBX */ -#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) +#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */ -#define XEN_CPUID_MAX_NUM_LEAVES 4 +/* + * Leaf 6 (0x40000x05) + * PV-specific parameters + * Sub-leaf 0: EAX: max available sub-leaf + * Sub-leaf 0: EBX: bits 0-7: max machine address width + */ + +/* Max. address width in bits taking memory hotplug into account. */ +#define XEN_CPUID_MACHINE_ADDRESS_WIDTH_MASK (0xffu << 0) + +#define XEN_CPUID_MAX_NUM_LEAVES 5 #endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */ From 8dca4d96c7d8058e7c97daad9ae0a76569ea31e6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 2 Nov 2017 10:19:21 +0100 Subject: [PATCH 26/34] xen: select grant interface version Grant v2 will be needed in cases where a frame number in the grant table can exceed 32 bits. For PV guests this is a host feature, while for HVM guests this is a guest feature. So select grant v2 in case frame numbers can be larger than 32 bits and grant v1 otherwise. For testing purposes, add a way to specify the grant interface version via a boot parameter.
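As a usage note (hedged: the parameter name below assumes the usual modname.param form for built-in code, derived here from grant-table.c), the override would be given on the kernel command line roughly as:

	grant_table.version=1   # force grant v1 even where v2 would be auto-selected
	grant_table.version=2   # force grant v2 for testing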
Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/grant-table.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 573e7fe9b147..fa152832e0e6 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -33,6 +33,7 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -43,6 +44,7 @@ #include #include #include +#include #include #include @@ -52,6 +54,9 @@ #include #include #include +#ifdef CONFIG_X86 +#include +#endif #include #include @@ -68,6 +73,8 @@ static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); struct grant_frames xen_auto_xlat_grant_frames; +static unsigned int xen_gnttab_version; +module_param_named(version, xen_gnttab_version, uint, 0); static union { struct grant_entry_v1 *v1; @@ -1177,12 +1184,36 @@ static const struct gnttab_ops gnttab_v2_ops = { .query_foreign_access = gnttab_query_foreign_access_v2, }; +static bool gnttab_need_v2(void) +{ +#ifdef CONFIG_X86 + uint32_t base, width; + + if (xen_pv_domain()) { + base = xen_cpuid_base(); + if (cpuid_eax(base) < 5) + return false; /* Information not available, use V1. */ + width = cpuid_ebx(base + 5) & + XEN_CPUID_MACHINE_ADDRESS_WIDTH_MASK; + return width > 32 + PAGE_SHIFT; + } +#endif + return !!(max_possible_pfn >> 32); +} + static void gnttab_request_version(void) { - int rc; + long rc; struct gnttab_set_version gsv; - gsv.version = 1; + if (gnttab_need_v2()) + gsv.version = 2; + else + gsv.version = 1; + + /* Boot parameter overrides automatic selection. */ + if (xen_gnttab_version >= 1 && xen_gnttab_version <= 2) + gsv.version = xen_gnttab_version; rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1); if (rc == 0 && gsv.version == 2) From 52847bb79b524c6f6c177d3247e824ad7487b31c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 8 Nov 2017 13:00:30 +0000 Subject: [PATCH 27/34] xen/privcmd: remove unused variable pageidx Variable pageidx is assigned a value but it is never read, hence it is redundant and can be removed. Cleans up clang warning: drivers/xen/privcmd.c:199:2: warning: Value stored to 'pageidx' is never read Signed-off-by: Colin Ian King Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/privcmd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index feca75b07fdd..1c909183c42a 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -191,13 +191,10 @@ static int traverse_pages_block(unsigned nelem, size_t size, void *state) { void *pagedata; - unsigned pageidx; int ret = 0; BUG_ON(size > PAGE_SIZE); - pageidx = PAGE_SIZE; - while (nelem) { int nr = (PAGE_SIZE/size); struct page *page; From 001f60e1f662a6dee1630a2915401aaf5959d479 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 8 Nov 2017 17:19:54 +0000 Subject: [PATCH 28/34] ptp_kvm: probe for kvm guest availability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the event of moving pvclock_pvti_cpu0_va() definition to common pvclock code, this function would return a value on non KVM guests. Later on this would fail with a GPF on ptp_kvm_init when running on a Xen guest. Therefore, ptp_kvm_init() should check whether it is running in a KVM guest. 
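A sketch of the resulting guard, not the patch itself (kvm_para_available() tests for the "KVMKVMKVM\0\0\0" CPUID hypervisor signature; the ordering assumes the pvti accessor becomes common code in the next patch):

#include <linux/init.h>
#include <linux/errno.h>
#include <asm/kvm_para.h>

static int __init ptp_kvm_init_sketch(void)
{
	/*
	 * Once pvclock_get_pvti_cpu0_va() lives in common pvclock code,
	 * it can be non-NULL on Xen too, so gate on really running under
	 * KVM before issuing any KVM hypercall (e.g. KVM_HC_CLOCK_PAIRING).
	 */
	if (!kvm_para_available())
		return -ENODEV;
	/* ... safe to talk to the KVM PTP clock from here on ... */
	return 0;
}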
Signed-off-by: Joao Martins Acked-by: Radim Krčmář Signed-off-by: Boris Ostrovsky --- drivers/ptp/ptp_kvm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c index 2b1b212c219e..e04d7b2ecb3a 100644 --- a/drivers/ptp/ptp_kvm.c +++ b/drivers/ptp/ptp_kvm.c @@ -178,6 +178,9 @@ static int __init ptp_kvm_init(void) { long ret; + if (!kvm_para_available()) + return -ENODEV; + clock_pair_gpa = slow_virt_to_phys(&clock_pair); hv_clock = pvclock_pvti_cpu0_va(); From 9f08890ab906abaf9d4c1bad8111755cbd302260 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 8 Nov 2017 17:19:55 +0000 Subject: [PATCH 29/34] x86/pvclock: add setter for pvclock_pvti_cpu0_va Right now there is only a pvclock_pvti_cpu0_va() which is defined on kvmclock since: commit dac16fba6fc5 ("x86/vdso: Get pvclock data from the vvar VMA instead of the fixmap") The only user of this interface so far is kvm. This commit adds a setter function for the pvti page and moves pvclock_pvti_cpu0_va to pvclock, which is a more generic place to have it; and would allow other PV clocksources to use it, such as Xen. While moving pvclock_pvti_cpu0_va into pvclock, rename also this function to pvclock_get_pvti_cpu0_va (including its call sites) to be symmetric with the setter (pvclock_set_pvti_cpu0_va). Signed-off-by: Joao Martins Acked-by: Andy Lutomirski Acked-by: Paolo Bonzini Acked-by: Thomas Gleixner Signed-off-by: Boris Ostrovsky --- arch/x86/entry/vdso/vma.c | 2 +- arch/x86/include/asm/pvclock.h | 19 ++++++++++--------- arch/x86/kernel/kvmclock.c | 7 +------ arch/x86/kernel/pvclock.c | 14 ++++++++++++++ drivers/ptp/ptp_kvm.c | 2 +- 5 files changed, 27 insertions(+), 17 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 1911310959f8..a77fd3c8d824 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -112,7 +112,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = - pvclock_pvti_cpu0_va(); + pvclock_get_pvti_cpu0_va(); if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { ret = vm_insert_pfn( vma, diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 448cfe1b48cf..55325f934d71 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -4,15 +4,6 @@ #include #include -#ifdef CONFIG_KVM_GUEST -extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void); -#else -static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) -{ - return NULL; -} -#endif - /* some helper functions for xen and kvm pv clock sources */ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); @@ -101,4 +92,14 @@ struct pvclock_vsyscall_time_info { #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) +#ifdef CONFIG_PARAVIRT_CLOCK +void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti); +struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void); +#else +static inline struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void) +{ + return NULL; +} +#endif + #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d88967659098..538738047ff5 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -47,12 +47,6 @@ early_param("no-kvmclock", parse_no_kvmclock); static struct 
pvclock_vsyscall_time_info *hv_clock; static struct pvclock_wall_clock wall_clock; -struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) -{ - return hv_clock; -} -EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va); - /* * The wallclock is the time of day when we booted. Since then, some time may * have elapsed since the hypervisor wrote the data. So we try to account for @@ -334,6 +328,7 @@ int __init kvm_setup_vsyscall_timeinfo(void) return 1; } + pvclock_set_pvti_cpu0_va(hv_clock); put_cpu(); kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 5c3f6d6a5078..761f6af6efa5 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -25,8 +25,10 @@ #include #include +#include static u8 valid_flags __read_mostly = 0; +static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly; void pvclock_set_flags(u8 flags) { @@ -144,3 +146,15 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } + +void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti) +{ + WARN_ON(vclock_was_used(VCLOCK_PVCLOCK)); + pvti_cpu0_va = pvti; +} + +struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void) +{ + return pvti_cpu0_va; +} +EXPORT_SYMBOL_GPL(pvclock_get_pvti_cpu0_va); diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c index e04d7b2ecb3a..c67dd11e08b1 100644 --- a/drivers/ptp/ptp_kvm.c +++ b/drivers/ptp/ptp_kvm.c @@ -182,7 +182,7 @@ static int __init ptp_kvm_init(void) return -ENODEV; clock_pair_gpa = slow_virt_to_phys(&clock_pair); - hv_clock = pvclock_pvti_cpu0_va(); + hv_clock = pvclock_get_pvti_cpu0_va(); if (!hv_clock) return -ENODEV; From b888808093113ae7d63d213272d01fea4b8329ed Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 8 Nov 2017 17:19:56 +0000 Subject: [PATCH 30/34] x86/xen/time: set pvclock flags on xen_time_init() Specifically, check for PVCLOCK_TSC_STABLE_BIT and, if this bit is set, set it in the pvclock flags as well. This allows the Xen clocksource to use it, thus speeding up xen_clocksource_read() callers (e.g. sched_clock()). Signed-off-by: Joao Martins Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- arch/x86/xen/time.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 244791ff8201..c96e61fd70e7 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -372,6 +372,7 @@ static const struct pv_time_ops xen_time_ops __initconst = { static void __init xen_time_init(void) { + struct pvclock_vcpu_time_info *pvti; int cpu = smp_processor_id(); struct timespec tp; @@ -395,6 +396,14 @@ static void __init xen_time_init(void) setup_force_cpu_cap(X86_FEATURE_TSC); + /* + * We check ahead on the primary time info if this + * bit is supported hence speeding up Xen clocksource. + */ + pvti = &__this_cpu_read(xen_vcpu)->time; + if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); From 2229f70b5bbb025e1394b61007938a68060afbfb Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 8 Nov 2017 17:19:57 +0000 Subject: [PATCH 31/34] x86/xen/time: setup vcpu 0 time info page In order to support the pvclock vdso on Xen, we need to set up the time info page for vcpu 0 and register the page with Xen using the VCPUOP_register_vcpu_time_memory_area hypercall.
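As an aside, the registration handshake that the hunks below implement condenses to the following sketch (error handling and cleanup elided; all names are taken from the diffs that follow):

	struct vcpu_register_time_memory_area t;
	struct pvclock_vsyscall_time_info *ti;

	/* Allocate the vcpu 0 time info page and register it with Xen. */
	ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
	t.addr.v = &ti->pvti;
	HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);

	/* PVCLOCK_TSC_STABLE_BIT is mandatory for vdso/vsyscall support. */
	if (ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)
		pvclock_set_pvti_cpu0_va(ti);	/* publish for the vdso mapping */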
This hypercall will also forcefully update the pvti, which will set some of the flags needed for the vdso. Afterwards, we check whether the PVCLOCK_TSC_STABLE_BIT flag is supported, which is mandatory for vdso/vsyscall support; if so, we set the cpu 0 pvti that will later be used when mapping the vdso image. The Xen headers are also updated to include the new hypercall for registering the secondary vcpu_time_info struct. Signed-off-by: Joao Martins Reviewed-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- arch/x86/xen/suspend.c | 4 ++ arch/x86/xen/time.c | 90 +++++++++++++++++++++++++++++++++++- arch/x86/xen/xen-ops.h | 2 + include/xen/interface/vcpu.h | 42 +++++++++++++++++ 4 files changed, 137 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index d6b1680693a9..800ed36ecfba 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -16,6 +16,8 @@ void xen_arch_pre_suspend(void) { + xen_save_time_memory_area(); + if (xen_pv_domain()) xen_pv_pre_suspend(); } @@ -26,6 +28,8 @@ void xen_arch_post_suspend(int cancelled) xen_pv_post_suspend(cancelled); else xen_hvm_post_suspend(cancelled); + + xen_restore_time_memory_area(); } static void xen_vcpu_notify_restore(void *data) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c96e61fd70e7..c2041043c606 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -370,6 +370,92 @@ static const struct pv_time_ops xen_time_ops __initconst = { .steal_clock = xen_steal_clock, }; +static struct pvclock_vsyscall_time_info *xen_clock __read_mostly; + +void xen_save_time_memory_area(void) +{ + struct vcpu_register_time_memory_area t; + int ret; + + if (!xen_clock) + return; + + t.addr.v = NULL; + + ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); + if (ret != 0) + pr_notice("Cannot save secondary vcpu_time_info (err %d)", + ret); + else + clear_page(xen_clock); +} + +void xen_restore_time_memory_area(void) +{ + struct vcpu_register_time_memory_area t; + int ret; + + if (!xen_clock) + return; + + t.addr.v = &xen_clock->pvti; + + ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); + + /* + * We don't disable VCLOCK_PVCLOCK entirely if it fails to register the + * secondary time info with Xen or if we migrated to a host without the + * necessary flags. In both of these cases, what happens is either a + * process seeing a zeroed-out pvti or seeing no PVCLOCK_TSC_STABLE_BIT + * set. Userspace checks the latter and, if it is 0, discards the data + * in the pvti and falls back to a system call for a reliable timestamp. + */ + if (ret != 0) + pr_notice("Cannot restore secondary vcpu_time_info (err %d)", + ret); +} + +static void xen_setup_vsyscall_time_info(void) +{ + struct vcpu_register_time_memory_area t; + struct pvclock_vsyscall_time_info *ti; + int ret; + + ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL); + if (!ti) + return; + + t.addr.v = &ti->pvti; + + ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); + if (ret) { + pr_notice("xen: VCLOCK_PVCLOCK not supported (err %d)\n", ret); + free_page((unsigned long)ti); + return; + } + + /* + * If the primary time info had this bit set, the secondary should too, + * since it is the same data in both, just in different memory regions. + * But we still check it in case the hypervisor is buggy.
+ */ +	if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) { + t.addr.v = NULL; + ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, + 0, &t); + if (!ret) + free_page((unsigned long)ti); + + pr_notice("xen: VCLOCK_PVCLOCK not supported (tsc unstable)\n"); + return; + } + + xen_clock = ti; + pvclock_set_pvti_cpu0_va(xen_clock); + + xen_clocksource.archdata.vclock_mode = VCLOCK_PVCLOCK; +} + static void __init xen_time_init(void) { struct pvclock_vcpu_time_info *pvti; @@ -401,8 +487,10 @@ static void __init xen_time_init(void) * bit is supported hence speeding up Xen clocksource. */ pvti = &__this_cpu_read(xen_vcpu)->time; - if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) + if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) { pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + xen_setup_vsyscall_time_info(); + } xen_setup_runstate_info(cpu); xen_setup_timer(cpu); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index c8a6d224f7ed..f96dbedb33d4 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -69,6 +69,8 @@ void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); u64 xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); +void xen_save_time_memory_area(void); +void xen_restore_time_memory_area(void); void __init xen_init_time_ops(void); void __init xen_hvm_init_time_ops(void); diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h index 98188c87f5c1..504c71601511 100644 --- a/include/xen/interface/vcpu.h +++ b/include/xen/interface/vcpu.h @@ -178,4 +178,46 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info); /* Send an NMI to the specified VCPU. @extra_arg == NULL. */ #define VCPUOP_send_nmi 11 + +/* + * Get the physical ID information for a pinned vcpu's underlying physical + * processor. The physical ID information is architecture-specific. + * On x86: id[31:0]=apic_id, id[63:32]=acpi_id. + * This command returns -EINVAL if it is not a valid operation for this VCPU. + */ +#define VCPUOP_get_physid 12 /* arg == vcpu_get_physid_t */ +struct vcpu_get_physid { + uint64_t phys_id; +}; +DEFINE_GUEST_HANDLE_STRUCT(vcpu_get_physid); +#define xen_vcpu_physid_to_x86_apicid(physid) ((uint32_t)(physid)) +#define xen_vcpu_physid_to_x86_acpiid(physid) ((uint32_t)((physid) >> 32)) + +/* + * Register a memory location to get a secondary copy of the vcpu time + * parameters. The master copy still exists as part of the vcpu shared + * memory area, and this secondary copy is updated whenever the master copy + * is updated (and using the same versioning scheme for synchronisation). + * + * The intent is that this copy may be mapped (RO) into userspace so + * that usermode can compute system time using the time info and the + * tsc. Usermode will see an array of vcpu_time_info structures, one + * for each vcpu, and choose the right one by an existing mechanism + * which allows it to get the current vcpu number (such as via a + * segment limit). It can then apply the normal algorithm to compute + * system time from the tsc. + * + * @extra_arg == pointer to vcpu_register_time_info_memory_area structure.
+ */ +#define VCPUOP_register_vcpu_time_memory_area 13 +DEFINE_GUEST_HANDLE_STRUCT(vcpu_time_info); +struct vcpu_register_time_memory_area { + union { + GUEST_HANDLE(vcpu_time_info) h; + struct pvclock_vcpu_time_info *v; + uint64_t p; + } addr; +}; +DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_time_memory_area); + #endif /* __XEN_PUBLIC_VCPU_H__ */ From ffe15f83ef856ba451e99c41b07d46763bd72a0f Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 8 Nov 2017 17:19:58 +0000 Subject: [PATCH 32/34] MAINTAINERS: xen, kvm: track pvclock-abi.h changes This file defines an ABI shared between guest and hypervisor(s) (KVM, Xen) and as such there should be an correspondent entry in MAINTAINERS file. Notice that there's already a text notice at the top of the header file, hence this commit simply enforces it more explicitly and have both peers noticed when such changes happen. Signed-off-by: Joao Martins Acked-by: Juergen Gross Reviewed-by: Konrad Rzeszutek Wilk Acked-by: Paolo Bonzini Signed-off-by: Boris Ostrovsky --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index af0cb69f6a3e..ff93f4a44d2e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7604,6 +7604,7 @@ S: Supported F: arch/x86/kvm/ F: arch/x86/include/uapi/asm/kvm* F: arch/x86/include/asm/kvm* +F: arch/x86/include/asm/pvclock-abi.h F: arch/x86/kernel/kvm.c F: arch/x86/kernel/kvmclock.c @@ -14731,6 +14732,7 @@ F: arch/x86/xen/ F: drivers/*/xen-*front.c F: drivers/xen/ F: arch/x86/include/asm/xen/ +F: arch/x86/include/asm/pvclock-abi.h F: include/xen/ F: include/uapi/xen/ F: Documentation/ABI/stable/sysfs-hypervisor-xen From 24e7f84db0db6b6f14581707a45d06ec5d5b88f2 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Wed, 15 Nov 2017 11:24:02 -0500 Subject: [PATCH 33/34] xen/pvcalls: Add MODULE_LICENSE() Since commit ba1029c9cbc5 ("modpost: detect modules without a MODULE_LICENSE") modules without said macro will generate WARNING: modpost: missing MODULE_LICENSE() in While at it, also add module description and attribution. Signed-off-by: Boris Ostrovsky Reviewed-by: Juergen Gross Acked-by: Stefano Stabellini --- drivers/xen/pvcalls-back.c | 4 ++++ drivers/xen/pvcalls-front.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c index b209cd44bb8d..02cd33c58204 100644 --- a/drivers/xen/pvcalls-back.c +++ b/drivers/xen/pvcalls-back.c @@ -1238,3 +1238,7 @@ static void __exit pvcalls_back_fin(void) } module_exit(pvcalls_back_fin); + +MODULE_DESCRIPTION("Xen PV Calls backend driver"); +MODULE_AUTHOR("Stefano Stabellini "); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 2925b2f095ed..9e40c2cd70b1 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -1273,3 +1273,7 @@ static int __init pvcalls_frontend_init(void) } module_init(pvcalls_frontend_init); + +MODULE_DESCRIPTION("Xen PV Calls frontend driver"); +MODULE_AUTHOR("Stefano Stabellini "); +MODULE_LICENSE("GPL"); From 646d944c2ef5a3b298c4e150494c71b9272d8b47 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 15 Nov 2017 13:20:21 -0800 Subject: [PATCH 34/34] xen/pvcalls: fix potential endless loop in pvcalls-front.c mutex_trylock() returns 1 if you take the lock and 0 if not. Assume you take in_mutex on the first try, but you can't take out_mutex. Next times you call mutex_trylock() in_mutex is going to fail. It's an endless loop. 
Solve the problem by waiting until the global refcount is 1 instead (the refcount is 1 when the only active pvcalls frontend function is pvcalls_front_release). Reported-by: Dan Carpenter Signed-off-by: Stefano Stabellini Signed-off-by: Boris Ostrovsky --- drivers/xen/pvcalls-front.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 9e40c2cd70b1..40caa92bff33 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -1041,13 +1041,12 @@ int pvcalls_front_release(struct socket *sock) wake_up_interruptible(&map->active.inflight_conn_req); /* - * Wait until there are no more waiters on the mutexes. - * We know that no new waiters can be added because sk_send_head - * is set to NULL -- we only need to wait for the existing - * waiters to return. + * We need to make sure that sendmsg/recvmsg on this socket have + * not started before we've cleared sk_send_head here. The + * easiest (though not optimal) way to guarantee this is to see + * that no pvcall (other than us) is in progress. */ - while (!mutex_trylock(&map->active.in_mutex) || - !mutex_trylock(&map->active.out_mutex)) + while (atomic_read(&pvcalls_refcount) > 1) cpu_relax(); pvcalls_front_free_map(bedata, map);
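For reference, the quiescence idiom the fix relies on, reduced to its generic form (a sketch, not the pvcalls code verbatim; pvcalls brackets its entry points with an equivalent global counter):

	static atomic_t users = ATOMIC_INIT(0);

	/* every entry point brackets its work: */
	atomic_inc(&users);
	/* ... perform the socket operation ... */
	atomic_dec(&users);

	/* the release path, itself counted, waits until it is the only user: */
	while (atomic_read(&users) > 1)
		cpu_relax();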