From e50d7607f1800c9f9c576229c6119e4c82f456d6 Mon Sep 17 00:00:00 2001
From: Liu Yuan <namei.unix@gmail.com>
Date: Sat, 14 Dec 2013 01:29:28 +0800
Subject: [PATCH 01/18] sheepdog: fix dynamic grow for running qcow2 format

When running qcow2 over sheepdog, we might meet following problem

  qemu-system-x86_64: shrinking is not supported

And cause IO errors to Guest. This is because we abuse bs->total_sectors, which
is manipulated by generic block layer and race with sheepdog code.

We should directly check if offset > vdi_size to dynamically enlarge the volume
instead of 'offset > bs->total_sectors', which will cause problem when following
case happens:

   vdi_size > offset > bs->total_sectors

   # then trigger sd_truncate() to shrink the volume wrongly.

Cc: qemu-devel@nongnu.org
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Reported-by: Hadrien KOHL <hadrien.kohl@gmail.com>
Signed-off-by: Liu Yuan <namei.unix@gmail.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/sheepdog.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index d1c812df3d..ba451a97a4 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -2048,13 +2048,14 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
 {
     SheepdogAIOCB *acb;
     int ret;
+    int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
+    BDRVSheepdogState *s = bs->opaque;
 
-    if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
-        ret = sd_truncate(bs, (sector_num + nb_sectors) * BDRV_SECTOR_SIZE);
+    if (bs->growable && offset > s->inode.vdi_size) {
+        ret = sd_truncate(bs, offset);
         if (ret < 0) {
             return ret;
         }
-        bs->total_sectors = sector_num + nb_sectors;
     }
 
     acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);

From 4d684832e54afe971fd8f94cb830ec1e135648e7 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 10 Dec 2013 13:26:58 +0100
Subject: [PATCH 02/18] vring: create a common function to parse descriptors

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hw/virtio/dataplane/vring.c | 113 ++++++++++++++++--------------------
 1 file changed, 51 insertions(+), 62 deletions(-)

diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c
index 351a343806..8294f36d07 100644
--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -110,6 +110,47 @@ bool vring_should_notify(VirtIODevice *vdev, Vring *vring)
     return vring_need_event(vring_used_event(&vring->vr), new, old);
 }
 
+
+static int get_desc(Vring *vring,
+                    struct iovec iov[], struct iovec *iov_end,
+                    unsigned int *out_num, unsigned int *in_num,
+                    struct vring_desc *desc)
+{
+    unsigned *num;
+
+    if (desc->flags & VRING_DESC_F_WRITE) {
+        num = in_num;
+    } else {
+        num = out_num;
+
+        /* If it's an output descriptor, they're all supposed
+         * to come before any input descriptors. */
+        if (unlikely(*in_num)) {
+            error_report("Descriptor has out after in");
+            return -EFAULT;
+        }
+    }
+
+    /* Stop for now if there are not enough iovecs available. */
+    iov += *in_num + *out_num;
+    if (iov >= iov_end) {
+        return -ENOBUFS;
+    }
+
+    /* TODO handle non-contiguous memory across region boundaries */
+    iov->iov_base = hostmem_lookup(&vring->hostmem, desc->addr, desc->len,
+                                   desc->flags & VRING_DESC_F_WRITE);
+    if (!iov->iov_base) {
+        error_report("Failed to map descriptor addr %#" PRIx64 " len %u",
+                     (uint64_t)desc->addr, desc->len);
+        return -EFAULT;
+    }
+
+    iov->iov_len = desc->len;
+    *num += 1;
+    return 0;
+}
+
 /* This is stolen from linux/drivers/vhost/vhost.c. */
 static int get_indirect(Vring *vring,
                         struct iovec iov[], struct iovec *iov_end,
@@ -118,6 +159,7 @@ static int get_indirect(Vring *vring,
 {
     struct vring_desc desc;
     unsigned int i = 0, count, found = 0;
+    int ret;
 
     /* Sanity check */
     if (unlikely(indirect->len % sizeof(desc))) {
@@ -170,36 +212,10 @@ static int get_indirect(Vring *vring,
             return -EFAULT;
         }
 
-        /* Stop for now if there are not enough iovecs available. */
-        if (iov >= iov_end) {
-            return -ENOBUFS;
-        }
-
-        iov->iov_base = hostmem_lookup(&vring->hostmem, desc.addr, desc.len,
-                                       desc.flags & VRING_DESC_F_WRITE);
-        if (!iov->iov_base) {
-            error_report("Failed to map indirect descriptor"
-                         "addr %#" PRIx64 " len %u",
-                         (uint64_t)desc.addr, desc.len);
-            vring->broken = true;
-            return -EFAULT;
-        }
-        iov->iov_len = desc.len;
-        iov++;
-
-        /* If this is an input descriptor, increment that count. */
-        if (desc.flags & VRING_DESC_F_WRITE) {
-            *in_num += 1;
-        } else {
-            /* If it's an output descriptor, they're all supposed
-             * to come before any input descriptors. */
-            if (unlikely(*in_num)) {
-                error_report("Indirect descriptor "
-                             "has out after in: idx %u", i);
-                vring->broken = true;
-                return -EFAULT;
-            }
-            *out_num += 1;
+        ret = get_desc(vring, iov, iov_end, out_num, in_num, &desc);
+        if (ret < 0) {
+            vring->broken |= (ret == -EFAULT);
+            return ret;
         }
         i = desc.next;
     } while (desc.flags & VRING_DESC_F_NEXT);
@@ -224,6 +240,7 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
     struct vring_desc desc;
     unsigned int i, head, found = 0, num = vring->vr.num;
     uint16_t avail_idx, last_avail_idx;
+    int ret;
 
     /* If there was a fatal error then refuse operation */
     if (vring->broken) {
@@ -294,40 +311,12 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
             continue;
         }
 
-        /* If there are not enough iovecs left, stop for now.  The caller
-         * should check if there are more descs available once they have dealt
-         * with the current set.
-         */
-        if (iov >= iov_end) {
-            return -ENOBUFS;
+        ret = get_desc(vring, iov, iov_end, out_num, in_num, &desc);
+        if (ret < 0) {
+            vring->broken |= (ret == -EFAULT);
+            return ret;
         }
 
-        /* TODO handle non-contiguous memory across region boundaries */
-        iov->iov_base = hostmem_lookup(&vring->hostmem, desc.addr, desc.len,
-                                       desc.flags & VRING_DESC_F_WRITE);
-        if (!iov->iov_base) {
-            error_report("Failed to map vring desc addr %#" PRIx64 " len %u",
-                         (uint64_t)desc.addr, desc.len);
-            vring->broken = true;
-            return -EFAULT;
-        }
-        iov->iov_len  = desc.len;
-        iov++;
-
-        if (desc.flags & VRING_DESC_F_WRITE) {
-            /* If this is an input descriptor,
-             * increment that count. */
-            *in_num += 1;
-        } else {
-            /* If it's an output descriptor, they're all supposed
-             * to come before any input descriptors. */
-            if (unlikely(*in_num)) {
-                error_report("Descriptor has out after in: idx %d", i);
-                vring->broken = true;
-                return -EFAULT;
-            }
-            *out_num += 1;
-        }
         i = desc.next;
     } while (desc.flags & VRING_DESC_F_NEXT);
 

From 781c117f3758bdb21e982d2aebba81febceccfe5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 10 Dec 2013 13:26:59 +0100
Subject: [PATCH 03/18] vring: factor common code for error exits

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hw/block/dataplane/virtio-blk.c |  1 +
 hw/virtio/dataplane/vring.c     | 34 ++++++++++++++++++++-------------
 2 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 1e57f3aabd..2b4a773f13 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -308,6 +308,7 @@ static void handle_notify(EventNotifier *e)
 
             if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
                 vring_set_broken(&s->vring);
+                ret = -EFAULT;
                 break;
             }
             iov += out_num + in_num;
diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c
index 8294f36d07..d81b653399 100644
--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -244,7 +244,8 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
 
     /* If there was a fatal error then refuse operation */
     if (vring->broken) {
-        return -EFAULT;
+        ret = -EFAULT;
+        goto out;
     }
 
     /* Check it isn't doing very strange things with descriptor numbers. */
@@ -255,13 +256,14 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
     if (unlikely((uint16_t)(avail_idx - last_avail_idx) > num)) {
         error_report("Guest moved used index from %u to %u",
                      last_avail_idx, avail_idx);
-        vring->broken = true;
-        return -EFAULT;
+        ret = -EFAULT;
+        goto out;
     }
 
     /* If there's nothing new since last we looked. */
     if (avail_idx == last_avail_idx) {
-        return -EAGAIN;
+        ret = -EAGAIN;
+        goto out;
     }
 
     /* Only get avail ring entries after they have been exposed by guest. */
@@ -274,8 +276,8 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
     /* If their number is silly, that's an error. */
     if (unlikely(head >= num)) {
         error_report("Guest says index %u > %u is available", head, num);
-        vring->broken = true;
-        return -EFAULT;
+        ret = -EFAULT;
+        goto out;
     }
 
     if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) {
@@ -289,14 +291,14 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
     do {
         if (unlikely(i >= num)) {
             error_report("Desc index is %u > %u, head = %u", i, num, head);
-            vring->broken = true;
-            return -EFAULT;
+            ret = -EFAULT;
+            goto out;
         }
         if (unlikely(++found > num)) {
             error_report("Loop detected: last one at %u vq size %u head %u",
                          i, num, head);
-            vring->broken = true;
-            return -EFAULT;
+            ret = -EFAULT;
+            goto out;
         }
         desc = vring->vr.desc[i];
 
@@ -306,15 +308,14 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
         if (desc.flags & VRING_DESC_F_INDIRECT) {
             int ret = get_indirect(vring, iov, iov_end, out_num, in_num, &desc);
             if (ret < 0) {
-                return ret;
+                goto out;
             }
             continue;
         }
 
         ret = get_desc(vring, iov, iov_end, out_num, in_num, &desc);
         if (ret < 0) {
-            vring->broken |= (ret == -EFAULT);
-            return ret;
+            goto out;
         }
 
         i = desc.next;
@@ -323,6 +324,13 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
     /* On success, increment avail index. */
     vring->last_avail_idx++;
     return head;
+
+out:
+    assert(ret < 0);
+    if (ret == -EFAULT) {
+        vring->broken = true;
+    }
+    return ret;
 }
 
 /* After we've used one of their buffers, we tell them about it.

From 8c1b566fd165af6cb12d6ef69eb554a347641e20 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 10 Dec 2013 13:27:00 +0100
Subject: [PATCH 04/18] dataplane: change vring API to use VirtQueueElement

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hw/block/dataplane/virtio-blk.c     | 85 +++++++++++------------------
 hw/virtio/dataplane/vring.c         | 56 ++++++++++++-------
 include/hw/virtio/dataplane/vring.h |  7 +--
 3 files changed, 72 insertions(+), 76 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 2b4a773f13..456d437ac3 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -35,7 +35,7 @@ enum {
 typedef struct {
     struct iocb iocb;               /* Linux AIO control block */
     QEMUIOVector *inhdr;            /* iovecs for virtio_blk_inhdr */
-    unsigned int head;              /* vring descriptor index */
+    VirtQueueElement *elem;         /* saved data from the virtqueue */
     struct iovec *bounce_iov;       /* used if guest buffers are unaligned */
     QEMUIOVector *read_qiov;        /* for read completion /w bounce buffer */
 } VirtIOBlockRequest;
@@ -96,7 +96,7 @@ static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
         len = 0;
     }
 
-    trace_virtio_blk_data_plane_complete_request(s, req->head, ret);
+    trace_virtio_blk_data_plane_complete_request(s, req->elem->index, ret);
 
     if (req->read_qiov) {
         assert(req->bounce_iov);
@@ -118,12 +118,12 @@ static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
      * written to, but for virtio-blk it seems to be the number of bytes
      * transferred plus the status bytes.
      */
-    vring_push(&s->vring, req->head, len + sizeof(hdr));
-
+    vring_push(&s->vring, req->elem, len + sizeof(hdr));
+    req->elem = NULL;
     s->num_reqs--;
 }
 
-static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head,
+static void complete_request_early(VirtIOBlockDataPlane *s, VirtQueueElement *elem,
                                    QEMUIOVector *inhdr, unsigned char status)
 {
     struct virtio_blk_inhdr hdr = {
@@ -134,26 +134,26 @@ static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head,
     qemu_iovec_destroy(inhdr);
     g_slice_free(QEMUIOVector, inhdr);
 
-    vring_push(&s->vring, head, sizeof(hdr));
+    vring_push(&s->vring, elem, sizeof(hdr));
     notify_guest(s);
 }
 
 /* Get disk serial number */
 static void do_get_id_cmd(VirtIOBlockDataPlane *s,
                           struct iovec *iov, unsigned int iov_cnt,
-                          unsigned int head, QEMUIOVector *inhdr)
+                          VirtQueueElement *elem, QEMUIOVector *inhdr)
 {
     char id[VIRTIO_BLK_ID_BYTES];
 
     /* Serial number not NUL-terminated when shorter than buffer */
     strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id));
     iov_from_buf(iov, iov_cnt, 0, id, sizeof(id));
-    complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
+    complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_OK);
 }
 
 static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
-                       struct iovec *iov, unsigned int iov_cnt,
-                       long long offset, unsigned int head,
+                       struct iovec *iov, unsigned iov_cnt,
+                       long long offset, VirtQueueElement *elem,
                        QEMUIOVector *inhdr)
 {
     struct iocb *iocb;
@@ -186,19 +186,20 @@ static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
 
     /* Fill in virtio block metadata needed for completion */
     VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
-    req->head = head;
+    req->elem = elem;
     req->inhdr = inhdr;
     req->bounce_iov = bounce_iov;
     req->read_qiov = read_qiov;
     return 0;
 }
 
-static int process_request(IOQueue *ioq, struct iovec iov[],
-                           unsigned int out_num, unsigned int in_num,
-                           unsigned int head)
+static int process_request(IOQueue *ioq, VirtQueueElement *elem)
 {
     VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue);
-    struct iovec *in_iov = &iov[out_num];
+    struct iovec *iov = elem->out_sg;
+    struct iovec *in_iov = elem->in_sg;
+    unsigned out_num = elem->out_num;
+    unsigned in_num = elem->in_num;
     struct virtio_blk_outhdr outhdr;
     QEMUIOVector *inhdr;
     size_t in_size;
@@ -229,29 +230,29 @@ static int process_request(IOQueue *ioq, struct iovec iov[],
 
     switch (outhdr.type) {
     case VIRTIO_BLK_T_IN:
-        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr);
+        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, elem, inhdr);
         return 0;
 
     case VIRTIO_BLK_T_OUT:
-        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr);
+        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, elem, inhdr);
         return 0;
 
     case VIRTIO_BLK_T_SCSI_CMD:
         /* TODO support SCSI commands */
-        complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP);
+        complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_UNSUPP);
         return 0;
 
     case VIRTIO_BLK_T_FLUSH:
         /* TODO fdsync not supported by Linux AIO, do it synchronously here! */
         if (qemu_fdatasync(s->fd) < 0) {
-            complete_request_early(s, head, inhdr, VIRTIO_BLK_S_IOERR);
+            complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_IOERR);
         } else {
-            complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
+            complete_request_early(s, elem, inhdr, VIRTIO_BLK_S_OK);
         }
         return 0;
 
     case VIRTIO_BLK_T_GET_ID:
-        do_get_id_cmd(s, in_iov, in_num, head, inhdr);
+        do_get_id_cmd(s, in_iov, in_num, elem, inhdr);
         return 0;
 
     default:
@@ -267,29 +268,8 @@ static void handle_notify(EventNotifier *e)
     VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane,
                                            host_notifier);
 
-    /* There is one array of iovecs into which all new requests are extracted
-     * from the vring.  Requests are read from the vring and the translated
-     * descriptors are written to the iovecs array.  The iovecs do not have to
-     * persist across handle_notify() calls because the kernel copies the
-     * iovecs on io_submit().
-     *
-     * Handling io_submit() EAGAIN may require storing the requests across
-     * handle_notify() calls until the kernel has sufficient resources to
-     * accept more I/O.  This is not implemented yet.
-     */
-    struct iovec iovec[VRING_MAX];
-    struct iovec *end = &iovec[VRING_MAX];
-    struct iovec *iov = iovec;
-
-    /* When a request is read from the vring, the index of the first descriptor
-     * (aka head) is returned so that the completed request can be pushed onto
-     * the vring later.
-     *
-     * The number of hypervisor read-only iovecs is out_num.  The number of
-     * hypervisor write-only iovecs is in_num.
-     */
-    int head;
-    unsigned int out_num = 0, in_num = 0;
+    VirtQueueElement *elem;
+    int ret;
     unsigned int num_queued;
 
     event_notifier_test_and_clear(&s->host_notifier);
@@ -298,30 +278,31 @@ static void handle_notify(EventNotifier *e)
         vring_disable_notification(s->vdev, &s->vring);
 
         for (;;) {
-            head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num);
-            if (head < 0) {
+            ret = vring_pop(s->vdev, &s->vring, &elem);
+            if (ret < 0) {
+                assert(elem == NULL);
                 break; /* no more requests */
             }
 
-            trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
-                                                        head);
+            trace_virtio_blk_data_plane_process_request(s, elem->out_num,
+                                                        elem->in_num, elem->index);
 
-            if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
+            if (process_request(&s->ioqueue, elem) < 0) {
                 vring_set_broken(&s->vring);
+                vring_free_element(elem);
                 ret = -EFAULT;
                 break;
             }
-            iov += out_num + in_num;
         }
 
-        if (likely(head == -EAGAIN)) { /* vring emptied */
+        if (likely(ret == -EAGAIN)) { /* vring emptied */
             /* Re-enable guest->host notifies and stop processing the vring.
              * But if the guest has snuck in more descriptors, keep processing.
              */
             if (vring_enable_notification(s->vdev, &s->vring)) {
                 break;
             }
-        } else { /* head == -ENOBUFS or fatal error, iovecs[] is depleted */
+        } else { /* ret == -ENOBUFS or fatal error, iovecs[] is depleted */
             /* Since there are no iovecs[] left, stop processing for now.  Do
              * not re-enable guest->host notifies since the I/O completion
              * handler knows to check for more vring descriptors anyway.
diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c
index d81b653399..1bf9dd335a 100644
--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -111,29 +111,32 @@ bool vring_should_notify(VirtIODevice *vdev, Vring *vring)
 }
 
 
-static int get_desc(Vring *vring,
-                    struct iovec iov[], struct iovec *iov_end,
-                    unsigned int *out_num, unsigned int *in_num,
+static int get_desc(Vring *vring, VirtQueueElement *elem,
                     struct vring_desc *desc)
 {
     unsigned *num;
+    struct iovec *iov;
+    hwaddr *addr;
 
     if (desc->flags & VRING_DESC_F_WRITE) {
-        num = in_num;
+        num = &elem->in_num;
+        iov = &elem->in_sg[*num];
+        addr = &elem->in_addr[*num];
     } else {
-        num = out_num;
+        num = &elem->out_num;
+        iov = &elem->out_sg[*num];
+        addr = &elem->out_addr[*num];
 
         /* If it's an output descriptor, they're all supposed
          * to come before any input descriptors. */
-        if (unlikely(*in_num)) {
+        if (unlikely(elem->in_num)) {
             error_report("Descriptor has out after in");
             return -EFAULT;
         }
     }
 
     /* Stop for now if there are not enough iovecs available. */
-    iov += *in_num + *out_num;
-    if (iov >= iov_end) {
+    if (*num >= VIRTQUEUE_MAX_SIZE) {
         return -ENOBUFS;
     }
 
@@ -147,14 +150,13 @@ static int get_desc(Vring *vring,
     }
 
     iov->iov_len = desc->len;
+    *addr = desc->addr;
     *num += 1;
     return 0;
 }
 
 /* This is stolen from linux/drivers/vhost/vhost.c. */
-static int get_indirect(Vring *vring,
-                        struct iovec iov[], struct iovec *iov_end,
-                        unsigned int *out_num, unsigned int *in_num,
+static int get_indirect(Vring *vring, VirtQueueElement *elem,
                         struct vring_desc *indirect)
 {
     struct vring_desc desc;
@@ -212,7 +214,7 @@ static int get_indirect(Vring *vring,
             return -EFAULT;
         }
 
-        ret = get_desc(vring, iov, iov_end, out_num, in_num, &desc);
+        ret = get_desc(vring, elem, &desc);
         if (ret < 0) {
             vring->broken |= (ret == -EFAULT);
             return ret;
@@ -222,6 +224,11 @@ static int get_indirect(Vring *vring,
     return 0;
 }
 
+void vring_free_element(VirtQueueElement *elem)
+{
+    g_slice_free(VirtQueueElement, elem);
+}
+
 /* This looks in the virtqueue and for the first available buffer, and converts
  * it to an iovec for convenient access.  Since descriptors consist of some
  * number of output then some number of input descriptors, it's actually two
@@ -234,12 +241,12 @@ static int get_indirect(Vring *vring,
  * Stolen from linux/drivers/vhost/vhost.c.
  */
 int vring_pop(VirtIODevice *vdev, Vring *vring,
-              struct iovec iov[], struct iovec *iov_end,
-              unsigned int *out_num, unsigned int *in_num)
+              VirtQueueElement **p_elem)
 {
     struct vring_desc desc;
     unsigned int i, head, found = 0, num = vring->vr.num;
     uint16_t avail_idx, last_avail_idx;
+    VirtQueueElement *elem = NULL;
     int ret;
 
     /* If there was a fatal error then refuse operation */
@@ -273,6 +280,10 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
      * the index we've seen. */
     head = vring->vr.avail->ring[last_avail_idx % num];
 
+    elem = g_slice_new(VirtQueueElement);
+    elem->index = head;
+    elem->in_num = elem->out_num = 0;
+    
     /* If their number is silly, that's an error. */
     if (unlikely(head >= num)) {
         error_report("Guest says index %u > %u is available", head, num);
@@ -284,9 +295,6 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
         vring_avail_event(&vring->vr) = vring->vr.avail->idx;
     }
 
-    /* When we start there are none of either input nor output. */
-    *out_num = *in_num = 0;
-
     i = head;
     do {
         if (unlikely(i >= num)) {
@@ -306,14 +314,14 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
         barrier();
 
         if (desc.flags & VRING_DESC_F_INDIRECT) {
-            int ret = get_indirect(vring, iov, iov_end, out_num, in_num, &desc);
+            int ret = get_indirect(vring, elem, &desc);
             if (ret < 0) {
                 goto out;
             }
             continue;
         }
 
-        ret = get_desc(vring, iov, iov_end, out_num, in_num, &desc);
+        ret = get_desc(vring, elem, &desc);
         if (ret < 0) {
             goto out;
         }
@@ -323,6 +331,7 @@ int vring_pop(VirtIODevice *vdev, Vring *vring,
 
     /* On success, increment avail index. */
     vring->last_avail_idx++;
+    *p_elem = elem;
     return head;
 
 out:
@@ -330,6 +339,10 @@ out:
     if (ret == -EFAULT) {
         vring->broken = true;
     }
+    if (elem) {
+        vring_free_element(elem);
+    }
+    *p_elem = NULL;
     return ret;
 }
 
@@ -337,11 +350,14 @@ out:
  *
  * Stolen from linux/drivers/vhost/vhost.c.
  */
-void vring_push(Vring *vring, unsigned int head, int len)
+void vring_push(Vring *vring, VirtQueueElement *elem, int len)
 {
     struct vring_used_elem *used;
+    unsigned int head = elem->index;
     uint16_t new;
 
+    vring_free_element(elem);
+
     /* Don't touch vring if a fatal error occurred */
     if (vring->broken) {
         return;
diff --git a/include/hw/virtio/dataplane/vring.h b/include/hw/virtio/dataplane/vring.h
index c0b69ff18f..3d6c0dfc76 100644
--- a/include/hw/virtio/dataplane/vring.h
+++ b/include/hw/virtio/dataplane/vring.h
@@ -54,9 +54,8 @@ void vring_teardown(Vring *vring, VirtIODevice *vdev, int n);
 void vring_disable_notification(VirtIODevice *vdev, Vring *vring);
 bool vring_enable_notification(VirtIODevice *vdev, Vring *vring);
 bool vring_should_notify(VirtIODevice *vdev, Vring *vring);
-int vring_pop(VirtIODevice *vdev, Vring *vring,
-              struct iovec iov[], struct iovec *iov_end,
-              unsigned int *out_num, unsigned int *in_num);
-void vring_push(Vring *vring, unsigned int head, int len);
+int vring_pop(VirtIODevice *vdev, Vring *vring, VirtQueueElement **elem);
+void vring_push(Vring *vring, VirtQueueElement *elem, int len);
+void vring_free_element(VirtQueueElement *elem);
 
 #endif /* VRING_H */

From 87b7f2f8c8da4d2da2728f0f1ad207973f1ea834 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 10 Dec 2013 13:27:01 +0100
Subject: [PATCH 05/18] dataplane: replace hostmem with memory_region_find

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hw/virtio/dataplane/Makefile.objs     |   2 +-
 hw/virtio/dataplane/hostmem.c         | 183 --------------------------
 hw/virtio/dataplane/vring.c           |  78 +++++++++--
 include/hw/virtio/dataplane/hostmem.h |  58 --------
 include/hw/virtio/dataplane/vring.h   |   3 +-
 5 files changed, 72 insertions(+), 252 deletions(-)
 delete mode 100644 hw/virtio/dataplane/hostmem.c
 delete mode 100644 include/hw/virtio/dataplane/hostmem.h

diff --git a/hw/virtio/dataplane/Makefile.objs b/hw/virtio/dataplane/Makefile.objs
index a91bf33c8b..9a8cfc0297 100644
--- a/hw/virtio/dataplane/Makefile.objs
+++ b/hw/virtio/dataplane/Makefile.objs
@@ -1 +1 @@
-common-obj-y += hostmem.o vring.o
+common-obj-y += vring.o
diff --git a/hw/virtio/dataplane/hostmem.c b/hw/virtio/dataplane/hostmem.c
deleted file mode 100644
index 901d98b8a0..0000000000
--- a/hw/virtio/dataplane/hostmem.c
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Thread-safe guest to host memory mapping
- *
- * Copyright 2012 Red Hat, Inc. and/or its affiliates
- *
- * Authors:
- *   Stefan Hajnoczi <stefanha@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#include "exec/address-spaces.h"
-#include "hw/virtio/dataplane/hostmem.h"
-
-static int hostmem_lookup_cmp(const void *phys_, const void *region_)
-{
-    hwaddr phys = *(const hwaddr *)phys_;
-    const HostMemRegion *region = region_;
-
-    if (phys < region->guest_addr) {
-        return -1;
-    } else if (phys >= region->guest_addr + region->size) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-/**
- * Map guest physical address to host pointer
- */
-void *hostmem_lookup(HostMem *hostmem, hwaddr phys, hwaddr len, bool is_write)
-{
-    HostMemRegion *region;
-    void *host_addr = NULL;
-    hwaddr offset_within_region;
-
-    qemu_mutex_lock(&hostmem->current_regions_lock);
-    region = bsearch(&phys, hostmem->current_regions,
-                     hostmem->num_current_regions,
-                     sizeof(hostmem->current_regions[0]),
-                     hostmem_lookup_cmp);
-    if (!region) {
-        goto out;
-    }
-    if (is_write && region->readonly) {
-        goto out;
-    }
-    offset_within_region = phys - region->guest_addr;
-    if (len <= region->size - offset_within_region) {
-        host_addr = region->host_addr + offset_within_region;
-    }
-out:
-    qemu_mutex_unlock(&hostmem->current_regions_lock);
-
-    return host_addr;
-}
-
-/**
- * Install new regions list
- */
-static void hostmem_listener_commit(MemoryListener *listener)
-{
-    HostMem *hostmem = container_of(listener, HostMem, listener);
-    int i;
-
-    qemu_mutex_lock(&hostmem->current_regions_lock);
-    for (i = 0; i < hostmem->num_current_regions; i++) {
-        memory_region_unref(hostmem->current_regions[i].mr);
-    }
-    g_free(hostmem->current_regions);
-    hostmem->current_regions = hostmem->new_regions;
-    hostmem->num_current_regions = hostmem->num_new_regions;
-    qemu_mutex_unlock(&hostmem->current_regions_lock);
-
-    /* Reset new regions list */
-    hostmem->new_regions = NULL;
-    hostmem->num_new_regions = 0;
-}
-
-/**
- * Add a MemoryRegionSection to the new regions list
- */
-static void hostmem_append_new_region(HostMem *hostmem,
-                                      MemoryRegionSection *section)
-{
-    void *ram_ptr = memory_region_get_ram_ptr(section->mr);
-    size_t num = hostmem->num_new_regions;
-    size_t new_size = (num + 1) * sizeof(hostmem->new_regions[0]);
-
-    hostmem->new_regions = g_realloc(hostmem->new_regions, new_size);
-    hostmem->new_regions[num] = (HostMemRegion){
-        .host_addr = ram_ptr + section->offset_within_region,
-        .guest_addr = section->offset_within_address_space,
-        .size = int128_get64(section->size),
-        .readonly = section->readonly,
-        .mr = section->mr,
-    };
-    hostmem->num_new_regions++;
-
-    memory_region_ref(section->mr);
-}
-
-static void hostmem_listener_append_region(MemoryListener *listener,
-                                           MemoryRegionSection *section)
-{
-    HostMem *hostmem = container_of(listener, HostMem, listener);
-
-    /* Ignore non-RAM regions, we may not be able to map them */
-    if (!memory_region_is_ram(section->mr)) {
-        return;
-    }
-
-    /* Ignore regions with dirty logging, we cannot mark them dirty */
-    if (memory_region_is_logging(section->mr)) {
-        return;
-    }
-
-    hostmem_append_new_region(hostmem, section);
-}
-
-/* We don't implement most MemoryListener callbacks, use these nop stubs */
-static void hostmem_listener_dummy(MemoryListener *listener)
-{
-}
-
-static void hostmem_listener_section_dummy(MemoryListener *listener,
-                                           MemoryRegionSection *section)
-{
-}
-
-static void hostmem_listener_eventfd_dummy(MemoryListener *listener,
-                                           MemoryRegionSection *section,
-                                           bool match_data, uint64_t data,
-                                           EventNotifier *e)
-{
-}
-
-static void hostmem_listener_coalesced_mmio_dummy(MemoryListener *listener,
-                                                  MemoryRegionSection *section,
-                                                  hwaddr addr, hwaddr len)
-{
-}
-
-void hostmem_init(HostMem *hostmem)
-{
-    memset(hostmem, 0, sizeof(*hostmem));
-
-    qemu_mutex_init(&hostmem->current_regions_lock);
-
-    hostmem->listener = (MemoryListener){
-        .begin = hostmem_listener_dummy,
-        .commit = hostmem_listener_commit,
-        .region_add = hostmem_listener_append_region,
-        .region_del = hostmem_listener_section_dummy,
-        .region_nop = hostmem_listener_append_region,
-        .log_start = hostmem_listener_section_dummy,
-        .log_stop = hostmem_listener_section_dummy,
-        .log_sync = hostmem_listener_section_dummy,
-        .log_global_start = hostmem_listener_dummy,
-        .log_global_stop = hostmem_listener_dummy,
-        .eventfd_add = hostmem_listener_eventfd_dummy,
-        .eventfd_del = hostmem_listener_eventfd_dummy,
-        .coalesced_mmio_add = hostmem_listener_coalesced_mmio_dummy,
-        .coalesced_mmio_del = hostmem_listener_coalesced_mmio_dummy,
-        .priority = 10,
-    };
-
-    memory_listener_register(&hostmem->listener, &address_space_memory);
-    if (hostmem->num_new_regions > 0) {
-        hostmem_listener_commit(&hostmem->listener);
-    }
-}
-
-void hostmem_finalize(HostMem *hostmem)
-{
-    memory_listener_unregister(&hostmem->listener);
-    g_free(hostmem->new_regions);
-    g_free(hostmem->current_regions);
-    qemu_mutex_destroy(&hostmem->current_regions_lock);
-}
diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c
index 1bf9dd335a..250d45ec3d 100644
--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -15,9 +15,53 @@
  */
 
 #include "trace.h"
+#include "hw/hw.h"
+#include "exec/memory.h"
+#include "exec/address-spaces.h"
 #include "hw/virtio/dataplane/vring.h"
 #include "qemu/error-report.h"
 
+/* vring_map can be coupled with vring_unmap or (if you still have the
+ * value returned in *mr) memory_region_unref.
+ */
+static void *vring_map(MemoryRegion **mr, hwaddr phys, hwaddr len,
+                       bool is_write)
+{
+    MemoryRegionSection section = memory_region_find(get_system_memory(), phys, len);
+
+    if (!section.mr || int128_get64(section.size) < len) {
+        goto out;
+    }
+    if (is_write && section.readonly) {
+        goto out;
+    }
+    if (!memory_region_is_ram(section.mr)) {
+        goto out;
+    }
+
+    /* Ignore regions with dirty logging, we cannot mark them dirty */
+    if (memory_region_is_logging(section.mr)) {
+        goto out;
+    }
+
+    *mr = section.mr;
+    return memory_region_get_ram_ptr(section.mr) + section.offset_within_region;
+
+out:
+    memory_region_unref(section.mr);
+    *mr = NULL;
+    return NULL;
+}
+
+static void vring_unmap(void *buffer, bool is_write)
+{
+    ram_addr_t addr;
+    MemoryRegion *mr;
+
+    mr = qemu_ram_addr_from_host(buffer, &addr);
+    memory_region_unref(mr);
+}
+
 /* Map the guest's vring to host memory */
 bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
 {
@@ -27,8 +71,7 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
 
     vring->broken = false;
 
-    hostmem_init(&vring->hostmem);
-    vring_ptr = hostmem_lookup(&vring->hostmem, vring_addr, vring_size, true);
+    vring_ptr = vring_map(&vring->mr, vring_addr, vring_size, true);
     if (!vring_ptr) {
         error_report("Failed to map vring "
                      "addr %#" HWADDR_PRIx " size %" HWADDR_PRIu,
@@ -54,7 +97,7 @@ void vring_teardown(Vring *vring, VirtIODevice *vdev, int n)
     virtio_queue_set_last_avail_idx(vdev, n, vring->last_avail_idx);
     virtio_queue_invalidate_signalled_used(vdev, n);
 
-    hostmem_finalize(&vring->hostmem);
+    memory_region_unref(vring->mr);
 }
 
 /* Disable guest->host notifies */
@@ -117,6 +160,7 @@ static int get_desc(Vring *vring, VirtQueueElement *elem,
     unsigned *num;
     struct iovec *iov;
     hwaddr *addr;
+    MemoryRegion *mr;
 
     if (desc->flags & VRING_DESC_F_WRITE) {
         num = &elem->in_num;
@@ -141,14 +185,16 @@ static int get_desc(Vring *vring, VirtQueueElement *elem,
     }
 
     /* TODO handle non-contiguous memory across region boundaries */
-    iov->iov_base = hostmem_lookup(&vring->hostmem, desc->addr, desc->len,
-                                   desc->flags & VRING_DESC_F_WRITE);
+    iov->iov_base = vring_map(&mr, desc->addr, desc->len,
+                              desc->flags & VRING_DESC_F_WRITE);
     if (!iov->iov_base) {
         error_report("Failed to map descriptor addr %#" PRIx64 " len %u",
                      (uint64_t)desc->addr, desc->len);
         return -EFAULT;
     }
 
+    /* The MemoryRegion is looked up again and unref'ed later, leave the
+     * ref in place.  */
     iov->iov_len = desc->len;
     *addr = desc->addr;
     *num += 1;
@@ -183,11 +229,12 @@ static int get_indirect(Vring *vring, VirtQueueElement *elem,
 
     do {
         struct vring_desc *desc_ptr;
+        MemoryRegion *mr;
 
         /* Translate indirect descriptor */
-        desc_ptr = hostmem_lookup(&vring->hostmem,
-                                  indirect->addr + found * sizeof(desc),
-                                  sizeof(desc), false);
+        desc_ptr = vring_map(&mr,
+                             indirect->addr + found * sizeof(desc),
+                             sizeof(desc), false);
         if (!desc_ptr) {
             error_report("Failed to map indirect descriptor "
                          "addr %#" PRIx64 " len %zu",
@@ -197,6 +244,7 @@ static int get_indirect(Vring *vring, VirtQueueElement *elem,
             return -EFAULT;
         }
         desc = *desc_ptr;
+        memory_region_unref(mr);
 
         /* Ensure descriptor has been loaded before accessing fields */
         barrier(); /* read_barrier_depends(); */
@@ -226,6 +274,20 @@ static int get_indirect(Vring *vring, VirtQueueElement *elem,
 
 void vring_free_element(VirtQueueElement *elem)
 {
+    int i;
+
+    /* This assumes that the iovecs, if changed, are never moved past
+     * the end of the valid area.  This is true if iovec manipulations
+     * are done with iov_discard_front and iov_discard_back.
+     */
+    for (i = 0; i < elem->out_num; i++) {
+        vring_unmap(elem->out_sg[i].iov_base, false);
+    }
+
+    for (i = 0; i < elem->in_num; i++) {
+        vring_unmap(elem->in_sg[i].iov_base, true);
+    }
+
     g_slice_free(VirtQueueElement, elem);
 }
 
diff --git a/include/hw/virtio/dataplane/hostmem.h b/include/hw/virtio/dataplane/hostmem.h
deleted file mode 100644
index 2810f4b44e..0000000000
--- a/include/hw/virtio/dataplane/hostmem.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Thread-safe guest to host memory mapping
- *
- * Copyright 2012 Red Hat, Inc. and/or its affiliates
- *
- * Authors:
- *   Stefan Hajnoczi <stefanha@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef HOSTMEM_H
-#define HOSTMEM_H
-
-#include "exec/memory.h"
-#include "qemu/thread.h"
-
-typedef struct {
-    MemoryRegion *mr;
-    void *host_addr;
-    hwaddr guest_addr;
-    uint64_t size;
-    bool readonly;
-} HostMemRegion;
-
-typedef struct {
-    /* The listener is invoked when regions change and a new list of regions is
-     * built up completely before they are installed.
-     */
-    MemoryListener listener;
-    HostMemRegion *new_regions;
-    size_t num_new_regions;
-
-    /* Current regions are accessed from multiple threads either to lookup
-     * addresses or to install a new list of regions.  The lock protects the
-     * pointer and the regions.
-     */
-    QemuMutex current_regions_lock;
-    HostMemRegion *current_regions;
-    size_t num_current_regions;
-} HostMem;
-
-void hostmem_init(HostMem *hostmem);
-void hostmem_finalize(HostMem *hostmem);
-
-/**
- * Map a guest physical address to a pointer
- *
- * Note that there is map/unmap mechanism here.  The caller must ensure that
- * mapped memory is no longer used across events like hot memory unplug.  This
- * can be done with other mechanisms like bdrv_drain_all() that quiesce
- * in-flight I/O.
- */
-void *hostmem_lookup(HostMem *hostmem, hwaddr phys, hwaddr len, bool is_write);
-
-#endif /* HOSTMEM_H */
diff --git a/include/hw/virtio/dataplane/vring.h b/include/hw/virtio/dataplane/vring.h
index 3d6c0dfc76..63e7bf4256 100644
--- a/include/hw/virtio/dataplane/vring.h
+++ b/include/hw/virtio/dataplane/vring.h
@@ -19,11 +19,10 @@
 
 #include <linux/virtio_ring.h>
 #include "qemu-common.h"
-#include "hostmem.h"
 #include "hw/virtio/virtio.h"
 
 typedef struct {
-    HostMem hostmem;                /* guest memory mapper */
+    MemoryRegion *mr;               /* memory region containing the vring */
     struct vring vr;                /* virtqueue vring mapped to host memory */
     uint16_t last_avail_idx;        /* last processed avail ring index */
     uint16_t last_used_idx;         /* last processed used ring index */

From c27de2a3e9c8664116287d639bacd600e61a6b45 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Wed, 11 Dec 2013 09:49:14 +0100
Subject: [PATCH 06/18] qapi-schema: fix QEMU 1.8 references

We are moving boldly on to QEMU 2.0 in the next release.  Some patches
written at a time where we assumed 1.8 would be the next version number
managed to sneak in.

s/1.8/2.0/ in qapi-schema.json

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 qapi-schema.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qapi-schema.json b/qapi-schema.json
index c3c939c8c3..5aa4581792 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -3022,7 +3022,7 @@
 #
 # @devname: #optional path of the netmap device (default: '/dev/netmap').
 #
-# Since 1.8
+# Since 2.0
 ##
 { 'type': 'NetdevNetmapOptions',
   'data': {

From 219c252193862898430e5dea5efb7447877aaa85 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Tue, 17 Dec 2013 08:57:10 +0100
Subject: [PATCH 07/18] block/iscsi: Fix compilation for libiscsi 1.4.0 (API
 change)

Function iscsi_read10_task got additional parameters starting with version
libiscsi 1.5.0.

libiscsi 1.4.0 is still widely used (Debian wheezy, jessie and other Linux
distributions currently provide packages for QEMU which use it), so we
still need support for this older API.

Reviewed-by: Peter Lieven <pl@kamp.de>
Signed-off-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/iscsi.c |  5 ++++-
 configure     | 22 ++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index fa69408df9..294b2c6ab3 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -359,7 +359,10 @@ retry:
     default:
         iTask.task = iscsi_read10_task(iscsilun->iscsi, iscsilun->lun, lba,
                                        num_sectors * iscsilun->block_size,
-                                       iscsilun->block_size, 0, 0, 0, 0, 0,
+                                       iscsilun->block_size,
+#if !defined(CONFIG_LIBISCSI_1_4) /* API change from 1.4.0 to 1.5.0 */
+                                       0, 0, 0, 0, 0,
+#endif
                                        iscsi_co_generic_cb, &iTask);
         break;
     }
diff --git a/configure b/configure
index 07b6be34ff..d97556cc76 100755
--- a/configure
+++ b/configure
@@ -3078,6 +3078,21 @@ EOF
   fi
 fi
 
+# We also need to know the API version because there was an
+# API change from 1.4.0 to 1.5.0.
+if test "$libiscsi" = "yes"; then
+  cat >$TMPC <<EOF
+#include <iscsi/iscsi.h>
+int main(void)
+{
+  iscsi_read10_task(0, 0, 0, 0, 0, 0, 0);
+  return 0;
+}
+EOF
+  if compile_prog "" "-liscsi"; then
+    libiscsi_version="1.4.0"
+  fi
+fi
 
 ##########################################
 # Do we need libm
@@ -3805,7 +3820,11 @@ echo "nss used          $smartcard_nss"
 echo "libusb            $libusb"
 echo "usb net redir     $usb_redir"
 echo "GLX support       $glx"
+if test "$libiscsi_version" = "1.4.0"; then
+echo "libiscsi support  $libiscsi (1.4.0)"
+else
 echo "libiscsi support  $libiscsi"
+fi
 echo "build guest agent $guest_agent"
 echo "QGA VSS support   $guest_agent_with_vss"
 echo "seccomp support   $seccomp"
@@ -4137,6 +4156,9 @@ fi
 
 if test "$libiscsi" = "yes" ; then
   echo "CONFIG_LIBISCSI=y" >> $config_host_mak
+  if test "$libiscsi_version" = "1.4.0"; then
+    echo "CONFIG_LIBISCSI_1_4=y" >> $config_host_mak
+  fi
 fi
 
 if test "$seccomp" = "yes"; then

From 7e30e6a6746b417c7e0dbc9af009560fbb63f336 Mon Sep 17 00:00:00 2001
From: Jeff Cody <jcody@redhat.com>
Date: Tue, 17 Dec 2013 05:33:37 -0500
Subject: [PATCH 08/18] block: vhdx - improve error message, and .bdrv_check
 implementation

If there is a dirty log file to be replayed in a VHDX image, it is
replayed in .vhdx_open().  However, if the file is opened read-only,
then a somewhat cryptic error message results.

This adds a more helpful error message for the user.  If an image file
contains a log to be replayed, and is opened read-only, the user is
instructed to run 'qemu-img check -r all' on the image file.

Running qemu-img check -r all will cause the image file to be opened
r/w, which will replay the log file.  If a log file replay is detected,
this is flagged, and bdrv_check will increase the corruptions_fixed
count for the image.

[Fixed typo in error message that was pointed out by Eric Blake
<eblake@redhat.com>.
--Stefan]

Signed-off-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vhdx-log.c | 13 ++++++++++++-
 block/vhdx.c     | 22 ++++++++++++++++++++--
 block/vhdx.h     |  5 ++++-
 3 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/block/vhdx-log.c b/block/vhdx-log.c
index ee5583c309..8c9ae0d8e7 100644
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -706,7 +706,8 @@ exit:
  *
  * If read-only, we must replay the log in RAM (or refuse to open
  * a dirty VHDX file read-only) */
-int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed)
+int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed,
+                   Error **errp)
 {
     int ret = 0;
     VHDXHeader *hdr;
@@ -761,6 +762,16 @@ int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed)
     }
 
     if (logs.valid) {
+        if (bs->read_only) {
+            ret = -EPERM;
+            error_setg_errno(errp, EPERM,
+                             "VHDX image file '%s' opened read-only, but "
+                             "contains a log that needs to be replayed.  To "
+                             "replay the log, execute:\n qemu-img check -r "
+                             "all '%s'",
+                             bs->filename, bs->filename);
+            goto exit;
+        }
         /* now flush the log */
         ret = vhdx_log_flush(bs, s, &logs);
         if (ret < 0) {
diff --git a/block/vhdx.c b/block/vhdx.c
index 67bbe103a1..1995778945 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -878,7 +878,6 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
     int ret = 0;
     uint32_t i;
     uint64_t signature;
-    bool log_flushed = false;
 
 
     s->bat = NULL;
@@ -907,7 +906,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
-    ret = vhdx_parse_log(bs, s, &log_flushed);
+    ret = vhdx_parse_log(bs, s, &s->log_replayed_on_open, errp);
     if (ret < 0) {
         goto fail;
     }
@@ -1854,6 +1853,24 @@ exit:
     return ret;
 }
 
+/* If opened r/w, the VHDX driver will automatically replay the log,
+ * if one is present, inside the vhdx_open() call.
+ *
+ * If qemu-img check -r all is called, the image is automatically opened
+ * r/w and any log has already been replayed, so there is nothing (currently)
+ * for us to do here
+ */
+static int vhdx_check(BlockDriverState *bs, BdrvCheckResult *result,
+                       BdrvCheckMode fix)
+{
+    BDRVVHDXState *s = bs->opaque;
+
+    if (s->log_replayed_on_open) {
+        result->corruptions_fixed++;
+    }
+    return 0;
+}
+
 static QEMUOptionParameter vhdx_create_options[] = {
     {
         .name = BLOCK_OPT_SIZE,
@@ -1898,6 +1915,7 @@ static BlockDriver bdrv_vhdx = {
     .bdrv_co_writev         = vhdx_co_writev,
     .bdrv_create            = vhdx_create,
     .bdrv_get_info          = vhdx_get_info,
+    .bdrv_check             = vhdx_check,
 
     .create_options         = vhdx_create_options,
 };
diff --git a/block/vhdx.h b/block/vhdx.h
index 51183b243c..2acd7c2d19 100644
--- a/block/vhdx.h
+++ b/block/vhdx.h
@@ -394,6 +394,8 @@ typedef struct BDRVVHDXState {
 
     Error *migration_blocker;
 
+    bool log_replayed_on_open;
+
     QLIST_HEAD(VHDXRegionHead, VHDXRegionEntry) regions;
 } BDRVVHDXState;
 
@@ -408,7 +410,8 @@ uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size,
 
 bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset);
 
-int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed);
+int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed,
+                   Error **errp);
 
 int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s,
                              void *data, uint32_t length, uint64_t offset);

From 8282db1b2e7394574cb55fcc608c5cb0df159d8f Mon Sep 17 00:00:00 2001
From: Jeff Cody <jcody@redhat.com>
Date: Tue, 17 Dec 2013 13:56:06 -0500
Subject: [PATCH 09/18] docs: updated qemu-img man page and qemu-doc to reflect
 VHDX support.

The man page for qemu-img, and the qemu-doc, did not mention VHDX
as a supported format.  This adds in reference to VHDX in those
documents.

[Stefan Weil <sw@weilnetz.de> suggested s/Block Size/Block size/ for
consistency.  I have made this change.
--Stefan]

Signed-off-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 qemu-doc.texi | 15 +++++++++++++++
 qemu-img.texi |  4 ++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/qemu-doc.texi b/qemu-doc.texi
index 185dd47a03..4e9c6e9b6e 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -654,6 +654,21 @@ Supported options:
 Specifies which VHD subformat to use. Valid options are
 @code{dynamic} (default) and @code{fixed}.
 @end table
+
+@item VHDX
+Hyper-V compatible image format (VHDX).
+Supported options:
+@table @code
+@item subformat
+Specifies which VHDX subformat to use. Valid options are
+@code{dynamic} (default) and @code{fixed}.
+@item block_state_zero
+Force use of payload blocks of type 'ZERO'.
+@item block_size
+Block size; min 1 MB, max 256 MB.  0 means auto-calculate based on image size.
+@item log_size
+Log size; min 1 MB.
+@end table
 @end table
 
 @subsubsection Read-only formats
diff --git a/qemu-img.texi b/qemu-img.texi
index be31191e43..1bba91efde 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -431,8 +431,8 @@ This option can only be enabled if @code{compat=1.1} is specified.
 
 @item Other
 QEMU also supports various other image file formats for compatibility with
-older QEMU versions or other hypervisors, including VMDK, VDI, VHD (vpc), qcow1
-and QED. For a full list of supported formats see @code{qemu-img --help}.
+older QEMU versions or other hypervisors, including VMDK, VDI, VHD (vpc), VHDX,
+qcow1 and QED. For a full list of supported formats see @code{qemu-img --help}.
 For a more detailed description of these formats, see the QEMU Emulation User
 Documentation.
 

From b47053bd0359c68094d7a25a65687c0844771e34 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Mon, 9 Dec 2013 13:24:36 +0800
Subject: [PATCH 10/18] vmdk: Check VMFS extent line field number

VMFS extent line in description file should be with 4 fields:

    RW <size> VMFS "file-name.vmdk"

Check the number explicitly and report error if offset is appended as
FLAT, which should be invalid format.

Reported-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vmdk.c               |  9 +++++++--
 tests/qemu-iotests/059     | 14 ++++++++++++++
 tests/qemu-iotests/059.out |  5 +++++
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index 0734bc200c..7917ad0c06 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -749,9 +749,14 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
                 return -EINVAL;
             }
         } else if (!strcmp(type, "VMFS")) {
-            flat_offset = 0;
+            if (ret == 4) {
+                flat_offset = 0;
+            } else {
+                error_setg(errp, "Invalid extent lines:\n%s", p);
+                return -EINVAL;
+            }
         } else if (ret != 4) {
-            error_setg(errp, "Invalid extent lines: \n%s", p);
+            error_setg(errp, "Invalid extent lines:\n%s", p);
             return -EINVAL;
         }
 
diff --git a/tests/qemu-iotests/059 b/tests/qemu-iotests/059
index 73941c3e61..65bea1d6c6 100755
--- a/tests/qemu-iotests/059
+++ b/tests/qemu-iotests/059
@@ -80,6 +80,20 @@ echo "=== Testing big twoGbMaxExtentFlat ==="
 IMGOPTS="subformat=twoGbMaxExtentFlat" _make_test_img 1000G
 $QEMU_IMG info $TEST_IMG | _filter_testdir | sed -e 's/cid: [0-9]*/cid: XXXXXXXX/'
 
+echo
+echo "=== Testing malformed VMFS extent description line ==="
+cat >"$TEST_IMG" <<EOF
+# Disk DescriptorFile
+version=1
+CID=58ab4847
+parentCID=ffffffff
+createType="vmfs"
+
+# Extent description
+RW 12582912 VMFS "dummy.vmdk" 1
+EOF
+_img_info
+
 echo
 echo "=== Testing version 3 ==="
 _use_sample_img iotest-version3.vmdk.bz2
diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out
index 4ff935c6f4..16ab7c6c1f 100644
--- a/tests/qemu-iotests/059.out
+++ b/tests/qemu-iotests/059.out
@@ -2038,6 +2038,11 @@ Format specific information:
             filename: TEST_DIR/t-f500.vmdk
             format: FLAT
 
+=== Testing malformed VMFS extent description line ===
+qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Invalid extent lines:
+RW 12582912 VMFS "dummy.IMGFMT" 1
+
+
 === Testing version 3 ===
 image: TEST_DIR/iotest-version3.IMGFMT
 file format: IMGFMT

From 917703c179d4483d050fe112a2a81acec3d6ffd7 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Fri, 20 Dec 2013 09:48:48 +0800
Subject: [PATCH 11/18] vmdk: Allow vmdk_create to work with protocol

This improves vmdk_create to use bdrv_* functions to replace qemu_open
and other fd functions. The error handling are improved as well. One
difference is that bdrv_pwrite will round up buffer to sectors, so for
description file, an extra bdrv_truncate is used in the end to drop
inding zeros.

Notes:

 - A bonus bug fix is correct endian is used in initializing GD entries.

 - ROUND_UP and DIV_ROUND_UP are used where possible.

I tested that new code produces exactly the same file as previously.

Signed-off-by: Fam Zheng <famz@redhat.com>
Tested-by: Peter Lieven <pl@kamp.de>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/vmdk.c | 164 +++++++++++++++++++++++++++++----------------------
 1 file changed, 95 insertions(+), 69 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index 7917ad0c06..c6b60b4a91 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -1452,23 +1452,33 @@ static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
 }
 
 static int vmdk_create_extent(const char *filename, int64_t filesize,
-                              bool flat, bool compress, bool zeroed_grain)
+                              bool flat, bool compress, bool zeroed_grain,
+                              Error **errp)
 {
     int ret, i;
-    int fd = 0;
+    BlockDriverState *bs = NULL;
     VMDK4Header header;
-    uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
+    Error *local_err;
+    uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
+    uint32_t *gd_buf = NULL;
+    int gd_buf_size;
 
-    fd = qemu_open(filename,
-                   O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
-                   0644);
-    if (fd < 0) {
-        return -errno;
+    ret = bdrv_create_file(filename, NULL, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto exit;
     }
+
+    ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto exit;
+    }
+
     if (flat) {
-        ret = ftruncate(fd, filesize);
+        ret = bdrv_truncate(bs, filesize);
         if (ret < 0) {
-            ret = -errno;
+            error_setg(errp, "Could not truncate file");
         }
         goto exit;
     }
@@ -1479,24 +1489,23 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
                    | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
                    | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
     header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
-    header.capacity = filesize / 512;
+    header.capacity = filesize / BDRV_SECTOR_SIZE;
     header.granularity = 128;
-    header.num_gtes_per_gt = 512;
+    header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
 
-    grains = (filesize / 512 + header.granularity - 1) / header.granularity;
-    gt_size = ((header.num_gtes_per_gt * sizeof(uint32_t)) + 511) >> 9;
-    gt_count =
-        (grains + header.num_gtes_per_gt - 1) / header.num_gtes_per_gt;
-    gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
+    grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
+    gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
+                           BDRV_SECTOR_SIZE);
+    gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
+    gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
 
     header.desc_offset = 1;
     header.desc_size = 20;
     header.rgd_offset = header.desc_offset + header.desc_size;
-    header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
+    header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
     header.grain_offset =
-       ((header.gd_offset + gd_size + (gt_size * gt_count) +
-         header.granularity - 1) / header.granularity) *
-        header.granularity;
+        ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
+                 header.granularity);
     /* swap endianness for all header fields */
     header.version = cpu_to_le32(header.version);
     header.flags = cpu_to_le32(header.flags);
@@ -1516,48 +1525,55 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
     header.check_bytes[3] = 0xa;
 
     /* write all the data */
-    ret = qemu_write_full(fd, &magic, sizeof(magic));
-    if (ret != sizeof(magic)) {
-        ret = -errno;
+    ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic));
+    if (ret < 0) {
+        error_set(errp, QERR_IO_ERROR);
         goto exit;
     }
-    ret = qemu_write_full(fd, &header, sizeof(header));
-    if (ret != sizeof(header)) {
-        ret = -errno;
+    ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header));
+    if (ret < 0) {
+        error_set(errp, QERR_IO_ERROR);
         goto exit;
     }
 
-    ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9);
+    ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9);
     if (ret < 0) {
-        ret = -errno;
+        error_setg(errp, "Could not truncate file");
         goto exit;
     }
 
     /* write grain directory */
-    lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
-    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size;
+    gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
+    gd_buf = g_malloc0(gd_buf_size);
+    for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
          i < gt_count; i++, tmp += gt_size) {
-        ret = qemu_write_full(fd, &tmp, sizeof(tmp));
-        if (ret != sizeof(tmp)) {
-            ret = -errno;
-            goto exit;
-        }
+        gd_buf[i] = cpu_to_le32(tmp);
+    }
+    ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
+                      gd_buf, gd_buf_size);
+    if (ret < 0) {
+        error_set(errp, QERR_IO_ERROR);
+        goto exit;
     }
 
     /* write backup grain directory */
-    lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
-    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size;
+    for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
          i < gt_count; i++, tmp += gt_size) {
-        ret = qemu_write_full(fd, &tmp, sizeof(tmp));
-        if (ret != sizeof(tmp)) {
-            ret = -errno;
-            goto exit;
-        }
+        gd_buf[i] = cpu_to_le32(tmp);
+    }
+    ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
+                      gd_buf, gd_buf_size);
+    if (ret < 0) {
+        error_set(errp, QERR_IO_ERROR);
+        goto exit;
     }
 
     ret = 0;
- exit:
-    qemu_close(fd);
+exit:
+    if (bs) {
+        bdrv_unref(bs);
+    }
+    g_free(gd_buf);
     return ret;
 }
 
@@ -1604,7 +1620,9 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
 static int vmdk_create(const char *filename, QEMUOptionParameter *options,
                        Error **errp)
 {
-    int fd, idx = 0;
+    int idx = 0;
+    BlockDriverState *new_bs = NULL;
+    Error *local_err;
     char *desc = NULL;
     int64_t total_size = 0, filesize;
     const char *adapter_type = NULL;
@@ -1621,6 +1639,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
     uint32_t parent_cid = 0xffffffff;
     uint32_t number_heads = 16;
     bool zeroed_grain = false;
+    uint32_t desc_offset = 0, desc_len;
     const char desc_template[] =
         "# Disk DescriptorFile\n"
         "version=1\n"
@@ -1754,7 +1773,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
                 path, desc_filename);
 
         if (vmdk_create_extent(ext_filename, size,
-                               flat, compress, zeroed_grain)) {
+                               flat, compress, zeroed_grain, errp)) {
             ret = -EINVAL;
             goto exit;
         }
@@ -1762,7 +1781,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
 
         /* Format description line */
         snprintf(desc_line, sizeof(desc_line),
-                    desc_extent_line, size / 512, desc_filename);
+                    desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename);
         g_string_append(ext_desc_lines, desc_line);
     }
     /* generate descriptor file */
@@ -1773,36 +1792,43 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options,
                            parent_desc_line,
                            ext_desc_lines->str,
                            (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
-                           total_size / (int64_t)(63 * number_heads * 512),
+                           total_size /
+                               (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
                            number_heads,
                            adapter_type);
-    if (split || flat) {
-        fd = qemu_open(filename,
-                       O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
-                       0644);
+    desc_len = strlen(desc);
+    /* the descriptor offset = 0x200 */
+    if (!split && !flat) {
+        desc_offset = 0x200;
     } else {
-        fd = qemu_open(filename,
-                       O_WRONLY | O_BINARY | O_LARGEFILE,
-                       0644);
+        ret = bdrv_create_file(filename, options, &local_err);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Could not create image file");
+            goto exit;
+        }
     }
-    if (fd < 0) {
-        ret = -errno;
+    ret = bdrv_file_open(&new_bs, filename, NULL, BDRV_O_RDWR, &local_err);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Could not write description");
         goto exit;
     }
-    /* the descriptor offset = 0x200 */
-    if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) {
-        ret = -errno;
-        goto close_exit;
+    ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Could not write description");
+        goto exit;
     }
-    ret = qemu_write_full(fd, desc, strlen(desc));
-    if (ret != strlen(desc)) {
-        ret = -errno;
-        goto close_exit;
+    /* bdrv_pwrite write padding zeros to align to sector, we don't need that
+     * for description file */
+    if (desc_offset == 0) {
+        ret = bdrv_truncate(new_bs, desc_len);
+        if (ret < 0) {
+            error_setg(errp, "Could not truncate file");
+        }
     }
-    ret = 0;
-close_exit:
-    qemu_close(fd);
 exit:
+    if (new_bs) {
+        bdrv_unref(new_bs);
+    }
     g_free(desc);
     g_string_free(ext_desc_lines, true);
     return ret;

From de99c417f6208a64b68e3b35d2aecbca1f60eae0 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 19 Dec 2013 16:26:44 +0100
Subject: [PATCH 12/18] qemu-iotests: drop duplicate virtio-blk initialization
 failure

Commit 75884afd5c6c42e523b08565e289dbe319e17ad9 ("virtio-blk: Convert to
QOM realize") dropped a duplicate error_report() call.  Now we no longer
get the following error message twice:

  QEMU_PROG: -drive if=virtio: Device initialization failed.

Update qemu-iotests 051.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/qemu-iotests/051.out | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/qemu-iotests/051.out b/tests/qemu-iotests/051.out
index 49e95a20cf..c2cadba2fc 100644
--- a/tests/qemu-iotests/051.out
+++ b/tests/qemu-iotests/051.out
@@ -91,7 +91,6 @@ Testing: -drive if=virtio
 QEMU X.Y.Z monitor - type 'help' for more information
 (qemu) QEMU_PROG: -drive if=virtio: Device needs media, but drive is empty
 QEMU_PROG: -drive if=virtio: Device initialization failed.
-QEMU_PROG: -drive if=virtio: Device initialization failed.
 QEMU_PROG: -drive if=virtio: Device 'virtio-blk-pci' could not be initialized
 
 Testing: -drive if=scsi

From f95c625ce4cb7863795fcc36502ac58a44fdb2f1 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Mon, 16 Dec 2013 14:45:28 +0800
Subject: [PATCH 13/18] mirror: Don't close target

Let reference count manage target and don't call bdrv_close here.

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/mirror.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/mirror.c b/block/mirror.c
index 6dc27ad35d..5b2c119e61 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -482,7 +482,6 @@ immediate_exit:
         }
         bdrv_swap(s->target, s->common.bs);
     }
-    bdrv_close(s->target);
     bdrv_unref(s->target);
     block_job_completed(&s->common, ret);
 }

From 5bc361b8134eff68e2c40916d1cf58b3523d223b Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Mon, 16 Dec 2013 14:45:29 +0800
Subject: [PATCH 14/18] mirror: Move base to MirrorBlockJob

This allows setting the base before entering mirror_run, commit will
make use of it.

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/mirror.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 5b2c119e61..605dda6093 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -31,6 +31,7 @@ typedef struct MirrorBlockJob {
     BlockJob common;
     RateLimit limit;
     BlockDriverState *target;
+    BlockDriverState *base;
     MirrorSyncMode mode;
     BlockdevOnError on_source_error, on_target_error;
     bool synced;
@@ -337,8 +338,7 @@ static void coroutine_fn mirror_run(void *opaque)
 
     if (s->mode != MIRROR_SYNC_MODE_NONE) {
         /* First part, loop on the sectors and initialize the dirty bitmap.  */
-        BlockDriverState *base;
-        base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd;
+        BlockDriverState *base = s->base;
         for (sector_num = 0; sector_num < end; ) {
             int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1;
             ret = bdrv_is_allocated_above(bs, base,
@@ -543,6 +543,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
                   void *opaque, Error **errp)
 {
     MirrorBlockJob *s;
+    BlockDriverState *base = NULL;
 
     if (granularity == 0) {
         /* Choose the default granularity based on the target file's cluster
@@ -565,6 +566,12 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
         return;
     }
 
+    if (mode == MIRROR_SYNC_MODE_TOP) {
+        base = bs->backing_hd;
+    } else {
+        base = NULL;
+    }
+
     s = block_job_create(&mirror_job_driver, bs, speed, cb, opaque, errp);
     if (!s) {
         return;
@@ -574,6 +581,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
     s->on_target_error = on_target_error;
     s->target = target;
     s->mode = mode;
+    s->base = base;
     s->granularity = granularity;
     s->buf_size = MAX(buf_size, granularity);
 

From 03544a6e9ecc1be115e8a29bd929f83b467d4816 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Mon, 16 Dec 2013 14:45:30 +0800
Subject: [PATCH 15/18] block: Add commit_active_start()

commit_active_start is implemented in block/mirror.c, It will create a
job with "commit" type and designated base in block-commit command. This
will be used for committing active layer of device.

Sync mode is removed from MirrorBlockJob because there's no proper type
for commit. The used information is is_none_mode.

The common part of mirror_start and commit_active_start is moved to
mirror_start_job().

Fix the comment wording for commit_start.

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/mirror.c            | 66 +++++++++++++++++++++++++++++----------
 include/block/block_int.h | 22 +++++++++++--
 2 files changed, 69 insertions(+), 19 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 605dda6093..04af341be6 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -32,7 +32,7 @@ typedef struct MirrorBlockJob {
     RateLimit limit;
     BlockDriverState *target;
     BlockDriverState *base;
-    MirrorSyncMode mode;
+    bool is_none_mode;
     BlockdevOnError on_source_error, on_target_error;
     bool synced;
     bool should_complete;
@@ -336,7 +336,7 @@ static void coroutine_fn mirror_run(void *opaque)
     sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
     mirror_free_init(s);
 
-    if (s->mode != MIRROR_SYNC_MODE_NONE) {
+    if (!s->is_none_mode) {
         /* First part, loop on the sectors and initialize the dirty bitmap.  */
         BlockDriverState *base = s->base;
         for (sector_num = 0; sector_num < end; ) {
@@ -535,15 +535,26 @@ static const BlockJobDriver mirror_job_driver = {
     .complete      = mirror_complete,
 };
 
-void mirror_start(BlockDriverState *bs, BlockDriverState *target,
-                  int64_t speed, int64_t granularity, int64_t buf_size,
-                  MirrorSyncMode mode, BlockdevOnError on_source_error,
-                  BlockdevOnError on_target_error,
-                  BlockDriverCompletionFunc *cb,
-                  void *opaque, Error **errp)
+static const BlockJobDriver commit_active_job_driver = {
+    .instance_size = sizeof(MirrorBlockJob),
+    .job_type      = BLOCK_JOB_TYPE_COMMIT,
+    .set_speed     = mirror_set_speed,
+    .iostatus_reset
+                   = mirror_iostatus_reset,
+    .complete      = mirror_complete,
+};
+
+static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
+                            int64_t speed, int64_t granularity,
+                            int64_t buf_size,
+                            BlockdevOnError on_source_error,
+                            BlockdevOnError on_target_error,
+                            BlockDriverCompletionFunc *cb,
+                            void *opaque, Error **errp,
+                            const BlockJobDriver *driver,
+                            bool is_none_mode, BlockDriverState *base)
 {
     MirrorBlockJob *s;
-    BlockDriverState *base = NULL;
 
     if (granularity == 0) {
         /* Choose the default granularity based on the target file's cluster
@@ -566,13 +577,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
         return;
     }
 
-    if (mode == MIRROR_SYNC_MODE_TOP) {
-        base = bs->backing_hd;
-    } else {
-        base = NULL;
-    }
 
-    s = block_job_create(&mirror_job_driver, bs, speed, cb, opaque, errp);
+    s = block_job_create(driver, bs, speed, cb, opaque, errp);
     if (!s) {
         return;
     }
@@ -580,7 +586,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
     s->on_source_error = on_source_error;
     s->on_target_error = on_target_error;
     s->target = target;
-    s->mode = mode;
+    s->is_none_mode = is_none_mode;
     s->base = base;
     s->granularity = granularity;
     s->buf_size = MAX(buf_size, granularity);
@@ -593,3 +599,31 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
     trace_mirror_start(bs, s, s->common.co, opaque);
     qemu_coroutine_enter(s->common.co, s);
 }
+
+void mirror_start(BlockDriverState *bs, BlockDriverState *target,
+                  int64_t speed, int64_t granularity, int64_t buf_size,
+                  MirrorSyncMode mode, BlockdevOnError on_source_error,
+                  BlockdevOnError on_target_error,
+                  BlockDriverCompletionFunc *cb,
+                  void *opaque, Error **errp)
+{
+    bool is_none_mode;
+    BlockDriverState *base;
+
+    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
+    base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL;
+    mirror_start_job(bs, target, speed, granularity, buf_size,
+                     on_source_error, on_target_error, cb, opaque, errp,
+                     &mirror_job_driver, is_none_mode, base);
+}
+
+void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
+                         int64_t speed,
+                         BlockdevOnError on_error,
+                         BlockDriverCompletionFunc *cb,
+                         void *opaque, Error **errp)
+{
+    mirror_start_job(bs, base, speed, 0, 0,
+                     on_error, on_error, cb, opaque, errp,
+                     &commit_active_job_driver, false, base);
+}
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 8b132d7178..2772f2f1bd 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -394,8 +394,9 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base,
 
 /**
  * commit_start:
- * @bs: Top Block device
- * @base: Block device that will be written into, and become the new top
+ * @bs: Active block device.
+ * @top: Top block device to be committed.
+ * @base: Block device that will be written into, and become the new top.
  * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
  * @on_error: The action to take upon error.
  * @cb: Completion function for the job.
@@ -407,7 +408,22 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
                  BlockDriverState *top, int64_t speed,
                  BlockdevOnError on_error, BlockDriverCompletionFunc *cb,
                  void *opaque, Error **errp);
-
+/**
+ * commit_active_start:
+ * @bs: Active block device to be committed.
+ * @base: Block device that will be written into, and become the new top.
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @errp: Error object.
+ *
+ */
+void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
+                         int64_t speed,
+                         BlockdevOnError on_error,
+                         BlockDriverCompletionFunc *cb,
+                         void *opaque, Error **errp);
 /*
  * mirror_start:
  * @bs: Block device to operate on.

From 20a63d2cec838c2dde4d246c4d7abe747d9b7a11 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Mon, 16 Dec 2013 14:45:31 +0800
Subject: [PATCH 16/18] commit: Support commit active layer

If active is top, it will be mirrored to base, (with block/mirror.c
code), then the image is switched when user completes the block job.

QMP documentation is updated.

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/mirror.c   | 11 +++++++++++
 blockdev.c       |  9 +++++++--
 qapi-schema.json |  5 +++--
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 04af341be6..2932bab27a 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -481,6 +481,13 @@ immediate_exit:
             bdrv_reopen(s->target, bdrv_get_flags(s->common.bs), NULL);
         }
         bdrv_swap(s->target, s->common.bs);
+        if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) {
+            /* drop the bs loop chain formed by the swap: break the loop then
+             * trigger the unref from the top one */
+            BlockDriverState *p = s->base->backing_hd;
+            s->base->backing_hd = NULL;
+            bdrv_unref(p);
+        }
     }
     bdrv_unref(s->target);
     block_job_completed(&s->common, ret);
@@ -623,6 +630,10 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
                          BlockDriverCompletionFunc *cb,
                          void *opaque, Error **errp)
 {
+    if (bdrv_reopen(base, bs->open_flags, errp)) {
+        return;
+    }
+    bdrv_ref(base);
     mirror_start_job(bs, base, speed, 0, 0,
                      on_error, on_error, cb, opaque, errp,
                      &commit_active_job_driver, false, base);
diff --git a/blockdev.c b/blockdev.c
index 6a85961af2..2c3242b87a 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1820,8 +1820,13 @@ void qmp_block_commit(const char *device,
         return;
     }
 
-    commit_start(bs, base_bs, top_bs, speed, on_error, block_job_cb, bs,
-                &local_err);
+    if (top_bs == bs) {
+        commit_active_start(bs, base_bs, speed, on_error, block_job_cb,
+                            bs, &local_err);
+    } else {
+        commit_start(bs, base_bs, top_bs, speed, on_error, block_job_cb, bs,
+                    &local_err);
+    }
     if (local_err != NULL) {
         error_propagate(errp, local_err);
         return;
diff --git a/qapi-schema.json b/qapi-schema.json
index 5aa4581792..b9b051c1fa 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1967,9 +1967,11 @@
 #
 # @top:              The file name of the backing image within the image chain,
 #                    which contains the topmost data to be committed down.
-#                    Note, the active layer as 'top' is currently unsupported.
 #
 #                    If top == base, that is an error.
+#                    If top == active, the job will not be completed by itself,
+#                    user needs to complete the job with the block-job-complete
+#                    command after getting the ready event. (Since 2.0)
 #
 #
 # @speed:  #optional the maximum speed, in bytes per second
@@ -1979,7 +1981,6 @@
 #          If @device does not exist, DeviceNotFound
 #          If image commit is not supported by this device, NotSupported
 #          If @base or @top is invalid, a generic error is returned
-#          If @top is the active layer, or omitted, a generic error is returned
 #          If @speed is invalid, InvalidParameter
 #
 # Since: 1.3

From 4de43470f2f35762b4b3e6a59b4aed55e239024a Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Mon, 16 Dec 2013 14:45:32 +0800
Subject: [PATCH 17/18] qemu-iotests: Update test cases for commit active

Factor out commit test common logic into super class, and update test
of committing the active image.

Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/qemu-iotests/040 | 74 ++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 42 deletions(-)

diff --git a/tests/qemu-iotests/040 b/tests/qemu-iotests/040
index 18dcd61ef2..72eaad5b08 100755
--- a/tests/qemu-iotests/040
+++ b/tests/qemu-iotests/040
@@ -39,6 +39,29 @@ class ImageCommitTestCase(iotests.QMPTestCase):
         result = self.vm.qmp('query-block-jobs')
         self.assert_qmp(result, 'return', [])
 
+    def run_commit_test(self, top, base):
+        self.assert_no_active_commit()
+        result = self.vm.qmp('block-commit', device='drive0', top=top, base=base)
+        self.assert_qmp(result, 'return', {})
+
+        completed = False
+        while not completed:
+            for event in self.vm.get_qmp_events(wait=True):
+                if event['event'] == 'BLOCK_JOB_COMPLETED':
+                    self.assert_qmp(event, 'data/type', 'commit')
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp(event, 'data/offset', self.image_len)
+                    self.assert_qmp(event, 'data/len', self.image_len)
+                    completed = True
+                elif event['event'] == 'BLOCK_JOB_READY':
+                    self.assert_qmp(event, 'data/type', 'commit')
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp(event, 'data/len', self.image_len)
+                    self.vm.qmp('block-job-complete', device='drive0')
+
+        self.assert_no_active_commit()
+        self.vm.shutdown()
+
 class TestSingleDrive(ImageCommitTestCase):
     image_len = 1 * 1024 * 1024
     test_len = 1 * 1024 * 256
@@ -59,23 +82,7 @@ class TestSingleDrive(ImageCommitTestCase):
         os.remove(backing_img)
 
     def test_commit(self):
-        self.assert_no_active_commit()
-        result = self.vm.qmp('block-commit', device='drive0', top='%s' % mid_img)
-        self.assert_qmp(result, 'return', {})
-
-        completed = False
-        while not completed:
-            for event in self.vm.get_qmp_events(wait=True):
-                if event['event'] == 'BLOCK_JOB_COMPLETED':
-                    self.assert_qmp(event, 'data/type', 'commit')
-                    self.assert_qmp(event, 'data/device', 'drive0')
-                    self.assert_qmp(event, 'data/offset', self.image_len)
-                    self.assert_qmp(event, 'data/len', self.image_len)
-                    completed = True
-
-        self.assert_no_active_commit()
-        self.vm.shutdown()
-
+        self.run_commit_test(mid_img, backing_img)
         self.assertEqual(-1, qemu_io('-c', 'read -P 0xab 0 524288', backing_img).find("verification failed"))
         self.assertEqual(-1, qemu_io('-c', 'read -P 0xef 524288 524288', backing_img).find("verification failed"))
 
@@ -102,10 +109,9 @@ class TestSingleDrive(ImageCommitTestCase):
         self.assert_qmp(result, 'error/desc', 'Base \'badfile\' not found')
 
     def test_top_is_active(self):
-        self.assert_no_active_commit()
-        result = self.vm.qmp('block-commit', device='drive0', top='%s' % test_img, base='%s' % backing_img)
-        self.assert_qmp(result, 'error/class', 'GenericError')
-        self.assert_qmp(result, 'error/desc', 'Top image as the active layer is currently unsupported')
+        self.run_commit_test(test_img, backing_img)
+        self.assertEqual(-1, qemu_io('-c', 'read -P 0xab 0 524288', backing_img).find("verification failed"))
+        self.assertEqual(-1, qemu_io('-c', 'read -P 0xef 524288 524288', backing_img).find("verification failed"))
 
     def test_top_and_base_reversed(self):
         self.assert_no_active_commit()
@@ -166,23 +172,7 @@ class TestRelativePaths(ImageCommitTestCase):
                 raise
 
     def test_commit(self):
-        self.assert_no_active_commit()
-        result = self.vm.qmp('block-commit', device='drive0', top='%s' % self.mid_img)
-        self.assert_qmp(result, 'return', {})
-
-        completed = False
-        while not completed:
-            for event in self.vm.get_qmp_events(wait=True):
-                if event['event'] == 'BLOCK_JOB_COMPLETED':
-                    self.assert_qmp(event, 'data/type', 'commit')
-                    self.assert_qmp(event, 'data/device', 'drive0')
-                    self.assert_qmp(event, 'data/offset', self.image_len)
-                    self.assert_qmp(event, 'data/len', self.image_len)
-                    completed = True
-
-        self.assert_no_active_commit()
-        self.vm.shutdown()
-
+        self.run_commit_test(self.mid_img, self.backing_img)
         self.assertEqual(-1, qemu_io('-c', 'read -P 0xab 0 524288', self.backing_img_abs).find("verification failed"))
         self.assertEqual(-1, qemu_io('-c', 'read -P 0xef 524288 524288', self.backing_img_abs).find("verification failed"))
 
@@ -209,10 +199,9 @@ class TestRelativePaths(ImageCommitTestCase):
         self.assert_qmp(result, 'error/desc', 'Base \'badfile\' not found')
 
     def test_top_is_active(self):
-        self.assert_no_active_commit()
-        result = self.vm.qmp('block-commit', device='drive0', top='%s' % self.test_img, base='%s' % self.backing_img)
-        self.assert_qmp(result, 'error/class', 'GenericError')
-        self.assert_qmp(result, 'error/desc', 'Top image as the active layer is currently unsupported')
+        self.run_commit_test(self.test_img, self.backing_img)
+        self.assertEqual(-1, qemu_io('-c', 'read -P 0xab 0 524288', self.backing_img_abs).find("verification failed"))
+        self.assertEqual(-1, qemu_io('-c', 'read -P 0xef 524288 524288', self.backing_img_abs).find("verification failed"))
 
     def test_top_and_base_reversed(self):
         self.assert_no_active_commit()
@@ -229,6 +218,7 @@ class TestSetSpeed(ImageCommitTestCase):
         qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % backing_img, mid_img)
         qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % mid_img, test_img)
         qemu_io('-c', 'write -P 0x1 0 512', test_img)
+        qemu_io('-c', 'write -P 0xef 524288 524288', mid_img)
         self.vm = iotests.VM().add_drive(test_img)
         self.vm.launch()
 

From 18da7f94cdce130f2a71387de4980ffa817181a1 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Mon, 16 Dec 2013 14:45:33 +0800
Subject: [PATCH 18/18] commit: Remove unused check

We support top == active for commit now, remove the check and add an
assertion here.

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/commit.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/block/commit.c b/block/commit.c
index d4090cbf7d..acec4ac5a8 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -198,13 +198,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
         return;
     }
 
-    /* Once we support top == active layer, remove this check */
-    if (top == bs) {
-        error_setg(errp,
-                   "Top image as the active layer is currently unsupported");
-        return;
-    }
-
+    assert(top != bs);
     if (top == base) {
         error_setg(errp, "Invalid files for merge: top and base are the same");
         return;