From 86f425db3b1c4b6c4a2927eaec35627f9ab2e703 Mon Sep 17 00:00:00 2001 From: Alex Bligh Date: Fri, 5 Apr 2013 15:45:10 +0000 Subject: [PATCH 1/4] Xen PV backend: Move call to bdrv_new from blk_init to blk_connect This commit delays the point at which bdrv_new (and hence blk_open on the underlying device) is called from blk_init to blk_connect. This ensures that in an inbound live migrate, the block device is not opened until it has been closed at the other end. This is in preparation for supporting devices with open/close consistency without using O_DIRECT. This commit does NOT itself change O_DIRECT semantics. Signed-off-by: Alex Bligh Signed-off-by: Stefano Stabellini --- hw/xen_disk.c | 100 +++++++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 46 deletions(-) diff --git a/hw/xen_disk.c b/hw/xen_disk.c index 83329e2e69..24e8b2491a 100644 --- a/hw/xen_disk.c +++ b/hw/xen_disk.c @@ -700,7 +700,7 @@ static void blk_alloc(struct XenDevice *xendev) static int blk_init(struct XenDevice *xendev) { struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev); - int index, qflags, info = 0; + int info = 0; /* read xenstore entries */ if (blkdev->params == NULL) { @@ -743,10 +743,7 @@ static int blk_init(struct XenDevice *xendev) } /* read-only ? */ - qflags = BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NATIVE_AIO; - if (strcmp(blkdev->mode, "w") == 0) { - qflags |= BDRV_O_RDWR; - } else { + if (strcmp(blkdev->mode, "w")) { info |= VDISK_READONLY; } @@ -755,50 +752,14 @@ static int blk_init(struct XenDevice *xendev) info |= VDISK_CDROM; } - /* init qemu block driver */ - index = (blkdev->xendev.dev - 202 * 256) / 16; - blkdev->dinfo = drive_get(IF_XEN, 0, index); - if (!blkdev->dinfo) { - /* setup via xenbus -> create new block driver instance */ - xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n"); - blkdev->bs = bdrv_new(blkdev->dev); - if (blkdev->bs) { - if (bdrv_open(blkdev->bs, blkdev->filename, NULL, qflags, - bdrv_find_whitelisted_format(blkdev->fileproto)) != 0) { - bdrv_delete(blkdev->bs); - blkdev->bs = NULL; - } - } - if (!blkdev->bs) { - goto out_error; - } - } else { - /* setup via qemu cmdline -> already setup for us */ - xen_be_printf(&blkdev->xendev, 2, "get configured bdrv (cmdline setup)\n"); - blkdev->bs = blkdev->dinfo->bdrv; - } - bdrv_attach_dev_nofail(blkdev->bs, blkdev); blkdev->file_blk = BLOCK_SIZE; - blkdev->file_size = bdrv_getlength(blkdev->bs); - if (blkdev->file_size < 0) { - xen_be_printf(&blkdev->xendev, 1, "bdrv_getlength: %d (%s) | drv %s\n", - (int)blkdev->file_size, strerror(-blkdev->file_size), - bdrv_get_format_name(blkdev->bs) ?: "-"); - blkdev->file_size = 0; - } - xen_be_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\"," - " size %" PRId64 " (%" PRId64 " MB)\n", - blkdev->type, blkdev->fileproto, blkdev->filename, - blkdev->file_size, blkdev->file_size >> 20); - - /* fill info */ + /* fill info + * blk_connect supplies sector-size and sectors + */ xenstore_write_be_int(&blkdev->xendev, "feature-flush-cache", 1); xenstore_write_be_int(&blkdev->xendev, "feature-persistent", 1); - xenstore_write_be_int(&blkdev->xendev, "info", info); - xenstore_write_be_int(&blkdev->xendev, "sector-size", blkdev->file_blk); - xenstore_write_be_int(&blkdev->xendev, "sectors", - blkdev->file_size / blkdev->file_blk); + xenstore_write_be_int(&blkdev->xendev, "info", info); return 0; out_error: @@ -818,7 +779,54 @@ out_error: static int blk_connect(struct XenDevice *xendev) { struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev); - int pers; + int pers, index, qflags; + + /* read-only ? */ + qflags = BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NATIVE_AIO; + if (strcmp(blkdev->mode, "w") == 0) { + qflags |= BDRV_O_RDWR; + } + + /* init qemu block driver */ + index = (blkdev->xendev.dev - 202 * 256) / 16; + blkdev->dinfo = drive_get(IF_XEN, 0, index); + if (!blkdev->dinfo) { + /* setup via xenbus -> create new block driver instance */ + xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n"); + blkdev->bs = bdrv_new(blkdev->dev); + if (blkdev->bs) { + if (bdrv_open(blkdev->bs, blkdev->filename, NULL, qflags, + bdrv_find_whitelisted_format(blkdev->fileproto)) != 0) { + bdrv_delete(blkdev->bs); + blkdev->bs = NULL; + } + } + if (!blkdev->bs) { + return -1; + } + } else { + /* setup via qemu cmdline -> already setup for us */ + xen_be_printf(&blkdev->xendev, 2, "get configured bdrv (cmdline setup)\n"); + blkdev->bs = blkdev->dinfo->bdrv; + } + bdrv_attach_dev_nofail(blkdev->bs, blkdev); + blkdev->file_size = bdrv_getlength(blkdev->bs); + if (blkdev->file_size < 0) { + xen_be_printf(&blkdev->xendev, 1, "bdrv_getlength: %d (%s) | drv %s\n", + (int)blkdev->file_size, strerror(-blkdev->file_size), + bdrv_get_format_name(blkdev->bs) ?: "-"); + blkdev->file_size = 0; + } + + xen_be_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\"," + " size %" PRId64 " (%" PRId64 " MB)\n", + blkdev->type, blkdev->fileproto, blkdev->filename, + blkdev->file_size, blkdev->file_size >> 20); + + /* Fill in number of sector size and number of sectors */ + xenstore_write_be_int(&blkdev->xendev, "sector-size", blkdev->file_blk); + xenstore_write_be_int(&blkdev->xendev, "sectors", + blkdev->file_size / blkdev->file_blk); if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) { return -1; From c1a88ad1f4ac994cd70695bf08141d161e21533e Mon Sep 17 00:00:00 2001 From: Alex Bligh Date: Fri, 5 Apr 2013 15:45:15 +0000 Subject: [PATCH 2/4] Xen PV backend: Disable use of O_DIRECT by default as it results in crashes. Due to what is almost certainly a kernel bug, writes with O_DIRECT may continue to reference the page after the write has been marked as completed, particularly in the case of TCP retransmit. In other scenarios, this "merely" risks data corruption on the write, but with Xen pages from domU are only transiently mapped into dom0's memory, resulting in kernel panics when they are subsequently accessed. This brings PV devices in line with emulated devices. Removing O_DIRECT is safe as barrier operations are now correctly passed through. See: http://lists.xen.org/archives/html/xen-devel/2012-12/msg01154.html for more details. Signed-off-by: Alex Bligh Signed-off-by: Stefano Stabellini --- hw/xen_disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/xen_disk.c b/hw/xen_disk.c index 24e8b2491a..1a30f0a9ed 100644 --- a/hw/xen_disk.c +++ b/hw/xen_disk.c @@ -782,7 +782,7 @@ static int blk_connect(struct XenDevice *xendev) int pers, index, qflags; /* read-only ? */ - qflags = BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NATIVE_AIO; + qflags = BDRV_O_CACHE_WB | BDRV_O_NATIVE_AIO; if (strcmp(blkdev->mode, "w") == 0) { qflags |= BDRV_O_RDWR; } From 10bb3c623478117aee5117c312736f10833decc2 Mon Sep 17 00:00:00 2001 From: Felipe Franciosi Date: Fri, 5 Apr 2013 15:37:32 +0000 Subject: [PATCH 3/4] Introduce 64 bit integer write interface to xenstore The current implementation of xen_backend only provides 32 bit integer functions to write to xenstore. This patch adds two functions that allow writing 64 bit integers (one generic function and another for the backend only). This patch also fixes the size of the char arrays used to represent these integers as strings (originally 32 bytes, however no more than 12 bytes are needed for 32 bit integers and no more than 21 bytes are needed for 64 bit integers). Signed-off-by: Felipe Franciosi Signed-off-by: Stefano Stabellini --- hw/xen_backend.c | 15 ++++++++++++++- hw/xen_backend.h | 2 ++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/hw/xen_backend.c b/hw/xen_backend.c index 24381b55e5..02693d7565 100644 --- a/hw/xen_backend.c +++ b/hw/xen_backend.c @@ -85,12 +85,20 @@ char *xenstore_read_str(const char *base, const char *node) int xenstore_write_int(const char *base, const char *node, int ival) { - char val[32]; + char val[12]; snprintf(val, sizeof(val), "%d", ival); return xenstore_write_str(base, node, val); } +int xenstore_write_int64(const char *base, const char *node, int64_t ival) +{ + char val[21]; + + snprintf(val, sizeof(val), "%"PRId64, ival); + return xenstore_write_str(base, node, val); +} + int xenstore_read_int(const char *base, const char *node, int *ival) { char *val; @@ -114,6 +122,11 @@ int xenstore_write_be_int(struct XenDevice *xendev, const char *node, int ival) return xenstore_write_int(xendev->be, node, ival); } +int xenstore_write_be_int64(struct XenDevice *xendev, const char *node, int64_t ival) +{ + return xenstore_write_int64(xendev->be, node, ival); +} + char *xenstore_read_be_str(struct XenDevice *xendev, const char *node) { return xenstore_read_str(xendev->be, node); diff --git a/hw/xen_backend.h b/hw/xen_backend.h index 6d5c699c51..d04b985d10 100644 --- a/hw/xen_backend.h +++ b/hw/xen_backend.h @@ -63,11 +63,13 @@ extern const char *xen_protocol; /* xenstore helper functions */ int xenstore_write_str(const char *base, const char *node, const char *val); int xenstore_write_int(const char *base, const char *node, int ival); +int xenstore_write_int64(const char *base, const char *node, int64_t ival); char *xenstore_read_str(const char *base, const char *node); int xenstore_read_int(const char *base, const char *node, int *ival); int xenstore_write_be_str(struct XenDevice *xendev, const char *node, const char *val); int xenstore_write_be_int(struct XenDevice *xendev, const char *node, int ival); +int xenstore_write_be_int64(struct XenDevice *xendev, const char *node, int64_t ival); char *xenstore_read_be_str(struct XenDevice *xendev, const char *node); int xenstore_read_be_int(struct XenDevice *xendev, const char *node, int *ival); char *xenstore_read_fe_str(struct XenDevice *xendev, const char *node); From 9246ce881128df2a69178779c1ef33c83df3c70d Mon Sep 17 00:00:00 2001 From: Felipe Franciosi Date: Fri, 5 Apr 2013 15:47:59 +0000 Subject: [PATCH 4/4] Allow xen guests to plug disks of 1 TiB or more The current xen backend driver implementation uses int64_t variables to store the size of the corresponding backend disk/file. It also uses an int64_t variable to store the block size of that image. When writing the number of sectors (file_size/block_size) to xenstore, however, it passes these values as 32 bit signed integers. This will cause an overflow for any disk of 1 TiB or more. This patch changes the xen backend driver to use a 64 bit integer write xenstore function. Signed-off-by: Felipe Franciosi Signed-off-by: Stefano Stabellini --- hw/xen_disk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/xen_disk.c b/hw/xen_disk.c index 1a30f0a9ed..47a51cf014 100644 --- a/hw/xen_disk.c +++ b/hw/xen_disk.c @@ -825,8 +825,8 @@ static int blk_connect(struct XenDevice *xendev) /* Fill in number of sector size and number of sectors */ xenstore_write_be_int(&blkdev->xendev, "sector-size", blkdev->file_blk); - xenstore_write_be_int(&blkdev->xendev, "sectors", - blkdev->file_size / blkdev->file_blk); + xenstore_write_be_int64(&blkdev->xendev, "sectors", + blkdev->file_size / blkdev->file_blk); if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) { return -1;