From 64d098886e0ec01f88349fe757161c2e2e89086b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 16 Apr 2012 10:11:12 -0400 Subject: [PATCH 01/10] virtio/tools: add delayed interupt mode Signed-off-by: Michael S. Tsirkin --- tools/virtio/linux/virtio.h | 1 + tools/virtio/virtio_test.c | 26 ++++++++++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index 7579f19e61e0..81847dd08bd0 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -203,6 +203,7 @@ void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); void virtqueue_disable_cb(struct virtqueue *vq); bool virtqueue_enable_cb(struct virtqueue *vq); +bool virtqueue_enable_cb_delayed(struct virtqueue *vq); void *virtqueue_detach_unused_buf(struct virtqueue *vq); struct virtqueue *vring_new_virtqueue(unsigned int num, diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index 6bf95f995364..e626fa553c5a 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -144,7 +144,8 @@ static void wait_for_interrupt(struct vdev_info *dev) } } -static void run_test(struct vdev_info *dev, struct vq_info *vq, int bufs) +static void run_test(struct vdev_info *dev, struct vq_info *vq, + bool delayed, int bufs) { struct scatterlist sl; long started = 0, completed = 0; @@ -183,8 +184,12 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq, int bufs) assert(started <= bufs); if (completed == bufs) break; - if (virtqueue_enable_cb(vq->vq)) { - wait_for_interrupt(dev); + if (delayed) { + if (virtqueue_enable_cb_delayed(vq->vq)) + wait_for_interrupt(dev); + } else { + if (virtqueue_enable_cb(vq->vq)) + wait_for_interrupt(dev); } } test = 0; @@ -215,6 +220,14 @@ const struct option longopts[] = { .name = "no-indirect", .val = 'i', }, + { + .name = "delayed-interrupt", + .val = 'D', + }, + { + .name = "no-delayed-interrupt", + .val = 'd', + }, { } }; @@ -224,6 +237,7 @@ static void help() fprintf(stderr, "Usage: virtio_test [--help]" " [--no-indirect]" " [--no-event-idx]" + " [--delayed-interrupt]" "\n"); } @@ -233,6 +247,7 @@ int main(int argc, char **argv) unsigned long long features = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | (1ULL << VIRTIO_RING_F_EVENT_IDX); int o; + bool delayed = false; for (;;) { o = getopt_long(argc, argv, optstring, longopts, NULL); @@ -251,6 +266,9 @@ int main(int argc, char **argv) case 'i': features &= ~(1ULL << VIRTIO_RING_F_INDIRECT_DESC); break; + case 'D': + delayed = true; + break; default: assert(0); break; @@ -260,6 +278,6 @@ int main(int argc, char **argv) done: vdev_info_init(&dev, features); vq_info_add(&dev, 256); - run_test(&dev, &dev.vqs[0], 0x100000); + run_test(&dev, &dev.vqs[0], delayed, 0x100000); return 0; } From 3afc9621f15701c557e60f61eba9242bac2771dd Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:41:30 +0800 Subject: [PATCH 02/10] macvtap: zerocopy: fix offset calculation when building skb This patch fixes the offset calculation when building skb: - offset1 were used as skb data offset not vector offset - reset offset to zero only when we advance to next vector Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/net/macvtap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 0427c6561c84..bd4a70d8605f 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -505,10 +505,11 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, if (copy > size) { ++from; --count; - } + offset = 0; + } else + offset += size; copy -= size; offset1 += size; - offset = 0; } if (len == offset1) @@ -519,13 +520,13 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, int num_pages; unsigned long base; - len = from->iov_len - offset1; + len = from->iov_len - offset; if (!len) { - offset1 = 0; + offset = 0; ++from; continue; } - base = (unsigned long)from->iov_base + offset1; + base = (unsigned long)from->iov_base + offset; size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; num_pages = get_user_pages_fast(base, size, 0, &page[i]); if ((num_pages != size) || @@ -546,7 +547,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, len -= size; i++; } - offset1 = 0; + offset = 0; ++from; } return 0; From 4ef67ebedffa44ed9939b34708ac2fee06d2f65f Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:41:44 +0800 Subject: [PATCH 03/10] macvtap: zerocopy: fix truesize underestimation As the skb fragment were pinned/built from user pages, we should account the page instead of length for truesize. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/net/macvtap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index bd4a70d8605f..7cb2684a8792 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -519,6 +519,7 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, struct page *page[MAX_SKB_FRAGS]; int num_pages; unsigned long base; + unsigned long truesize; len = from->iov_len - offset; if (!len) { @@ -533,10 +534,11 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) /* put_page is in skb free */ return -EFAULT; + truesize = size * PAGE_SIZE; skb->data_len += len; skb->len += len; - skb->truesize += len; - atomic_add(len, &skb->sk->sk_wmem_alloc); + skb->truesize += truesize; + atomic_add(truesize, &skb->sk->sk_wmem_alloc); while (len) { int off = base & ~PAGE_MASK; int size = min_t(int, len, PAGE_SIZE - off); From 02ce04bb3d28c3333231f43bca677228dbc686fe Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:41:58 +0800 Subject: [PATCH 04/10] macvtap: zerocopy: put page when fail to get all requested user pages When get_user_pages_fast() fails to get all requested pages, we could not use kfree_skb() to free it as it has not been put in the skb fragments. So we need to call put_page() instead. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/net/macvtap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 7cb2684a8792..9ab182a40648 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -531,9 +531,11 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; num_pages = get_user_pages_fast(base, size, 0, &page[i]); if ((num_pages != size) || - (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) - /* put_page is in skb free */ + (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) { + for (i = 0; i < num_pages; i++) + put_page(page[i]); return -EFAULT; + } truesize = size * PAGE_SIZE; skb->data_len += len; skb->len += len; From 01d6657b388438def19c8baaea28e742b6ed32ec Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:42:06 +0800 Subject: [PATCH 05/10] macvtap: zerocopy: set SKBTX_DEV_ZEROCOPY only when skb is built successfully Current the SKBTX_DEV_ZEROCOPY is set unconditionally after zerocopy_sg_from_iovec(), this would lead NULL pointer when macvtap fails to build zerocopy skb because destructor_arg was not initialized. Solve this by set this flag after the skb were built successfully. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/net/macvtap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 9ab182a40648..a4ff694cea22 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -699,10 +699,9 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, if (!skb) goto err; - if (zerocopy) { + if (zerocopy) err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count); - skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; - } else + else err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len); if (err) @@ -721,8 +720,10 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, rcu_read_lock_bh(); vlan = rcu_dereference_bh(q->vlan); /* copy skb_ubuf_info for callback when skb has no error */ - if (zerocopy) + if (zerocopy) { skb_shinfo(skb)->destructor_arg = m->msg_control; + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + } if (vlan) macvlan_start_xmit(skb, vlan->dev); else From b92946e2919134ebe2a4083e4302236295ea2a73 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:42:15 +0800 Subject: [PATCH 06/10] macvtap: zerocopy: validate vectors before building skb There're several reasons that the vectors need to be validated: - Return error when caller provides vectors whose num is greater than UIO_MAXIOV. - Linearize part of skb when userspace provides vectors grater than MAX_SKB_FRAGS. - Return error when userspace provides vectors whose total length may exceed - MAX_SKB_FRAGS * PAGE_SIZE. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/net/macvtap.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index a4ff694cea22..163559c16988 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -529,9 +529,10 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, } base = (unsigned long)from->iov_base + offset; size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + if (i + size > MAX_SKB_FRAGS) + return -EMSGSIZE; num_pages = get_user_pages_fast(base, size, 0, &page[i]); - if ((num_pages != size) || - (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) { + if (num_pages != size) { for (i = 0; i < num_pages; i++) put_page(page[i]); return -EFAULT; @@ -651,7 +652,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, int err; struct virtio_net_hdr vnet_hdr = { 0 }; int vnet_hdr_len = 0; - int copylen; + int copylen = 0; bool zerocopy = false; if (q->flags & IFF_VNET_HDR) { @@ -680,15 +681,31 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, if (unlikely(len < ETH_HLEN)) goto err; + err = -EMSGSIZE; + if (unlikely(count > UIO_MAXIOV)) + goto err; + if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) zerocopy = true; if (zerocopy) { + /* Userspace may produce vectors with count greater than + * MAX_SKB_FRAGS, so we need to linearize parts of the skb + * to let the rest of data to be fit in the frags. + */ + if (count > MAX_SKB_FRAGS) { + copylen = iov_length(iv, count - MAX_SKB_FRAGS); + if (copylen < vnet_hdr_len) + copylen = 0; + else + copylen -= vnet_hdr_len; + } /* There are 256 bytes to be copied in skb, so there is enough * room for skb expand head in case it is used. * The rest buffer is mapped from userspace. */ - copylen = vnet_hdr.hdr_len; + if (copylen < vnet_hdr.hdr_len) + copylen = vnet_hdr.hdr_len; if (!copylen) copylen = GOODCOPY_LEN; } else From c460f0573941cb28dc7f35595679c3508f0ce41f Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:42:23 +0800 Subject: [PATCH 07/10] vhost_net: zerocopy: fix possible NULL pointer dereference of vq->bufs When we want to disable vhost_net backend while there's a tx work, a possible NULL pointer defernece may happen we we try to deference the vq->bufs after vhost_net_set_backend() assign a NULL to it. As suggested by Michael, fix this by checking the vq->bufs instead of vhost_sock_zcopy(). Signed-off-by: Michael S. Tsirkin --- drivers/vhost/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 1f21d2a1e528..35abe9019ba7 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -166,7 +166,7 @@ static void handle_tx(struct vhost_net *net) if (wmem < sock->sk->sk_sndbuf / 2) tx_poll_stop(net); hdr_size = vq->vhost_hlen; - zcopy = vhost_sock_zcopy(sock); + zcopy = vq->ubufs; for (;;) { /* Release DMAs done buffers first */ From dbf34207c62bdec16b49721d119647c470a3443c Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:42:32 +0800 Subject: [PATCH 08/10] vhost_net: re-poll only on EAGAIN or ENOBUFS Currently, we restart tx polling unconditionally when sendmsg() fails. This would cause unnecessary wakeups of vhost wokers and waste cpu utlization when evil userspace(guest driver) is able to hit EFAULT or EINVAL. The polling is only needed when the socket send buffer were exceeded or not enough memory. So fix this by restarting polling only when sendmsg() returns EAGAIN/ENOBUFS. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/vhost/net.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 35abe9019ba7..f54b1d5fc234 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -257,7 +257,8 @@ static void handle_tx(struct vhost_net *net) UIO_MAXIOV; } vhost_discard_vq_desc(vq, 1); - tx_poll_start(net, sock); + if (err == -EAGAIN || err == -ENOBUFS) + tx_poll_start(net, sock); break; } if (err != len) From c8fb217af57c6c232af3517d3115d2af4ce9900e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:42:41 +0800 Subject: [PATCH 09/10] vhost_net: zerocopy: adding and signalling immediately when fully copied When a packet were fully copied in zerocopy, we don't wait for the DMA done to mark the done flag, so after the packet were passed to lower device, we need to add used and signal guest immediately. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/vhost/net.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index f54b1d5fc234..853db7a08a26 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -266,6 +266,8 @@ static void handle_tx(struct vhost_net *net) " len %d != %zd\n", err, len); if (!zcopy) vhost_add_used_and_signal(&net->dev, vq, head, 0); + else + vhost_zerocopy_signal_used(vq); total_len += len; if (unlikely(total_len >= VHOST_NET_WEIGHT)) { vhost_poll_queue(&vq->poll); From c70aa540c7a9f67add11ad3161096fb95233aa2e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 May 2012 11:42:54 +0800 Subject: [PATCH 10/10] vhost: zerocopy: poll vq in zerocopy callback We add used and signal guest in worker thread but did not poll the virtqueue during the zero copy callback. This may lead the missing of adding and signalling during zerocopy. Solve this by polling the virtqueue and let it wakeup the worker during callback. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 51e4c1eeec4f..94dbd25caa30 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1603,6 +1603,7 @@ void vhost_zerocopy_callback(struct ubuf_info *ubuf) struct vhost_ubuf_ref *ubufs = ubuf->ctx; struct vhost_virtqueue *vq = ubufs->vq; + vhost_poll_queue(&vq->poll); /* set len = 1 to mark this desc buffers done DMA */ vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; kref_put(&ubufs->kref, vhost_zerocopy_done_signal);