From 4fc16838b8392a29644d4d2c01495e6ff447a6f0 Mon Sep 17 00:00:00 2001 From: "Richard W.M. Jones" Date: Fri, 19 Apr 2013 09:16:39 +0100 Subject: [PATCH 01/11] block/ssh: Require libssh2 >= 1.2.8. libssh2 >= 1.2.8 is required to enable this block device (because that version introduced the libssh2_session_handshake call). Change the test to use pkg-config exclusively. If the user requests --enable-libssh2 and the minimum version is not available, then the following error is displayed: $ ./configure --enable-libssh2 ERROR: libssh2 >= 1.2.8 required for --enable-libssh2 If --enable-libssh2 is not specified, then the feature is silently disabled if sufficiently new libssh2 is not available. Signed-off-by: Stefan Hajnoczi --- configure | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/configure b/configure index ee2e7e8ad9..8188a7c05f 100755 --- a/configure +++ b/configure @@ -2364,35 +2364,19 @@ fi ########################################## # libssh2 probe +min_libssh2_version=1.2.8 if test "$libssh2" != "no" ; then - cat > $TMPC < -#include -#include -int main(void) { - LIBSSH2_SESSION *session; - session = libssh2_session_init (); - (void) libssh2_sftp_init (session); - return 0; -} -EOF - - if $pkg_config libssh2 --modversion >/dev/null 2>&1; then + if $pkg_config --atleast-version=$min_libssh2_version libssh2 >/dev/null 2>&1 + then libssh2_cflags=`$pkg_config libssh2 --cflags` libssh2_libs=`$pkg_config libssh2 --libs` - else - libssh2_cflags= - libssh2_libs="-lssh2" - fi - - if compile_prog "$libssh2_cflags" "$libssh2_libs" ; then libssh2=yes libs_tools="$libssh2_libs $libs_tools" libs_softmmu="$libssh2_libs $libs_softmmu" QEMU_CFLAGS="$QEMU_CFLAGS $libssh2_cflags" else if test "$libssh2" = "yes" ; then - feature_not_found "libssh2" + error_exit "libssh2 >= $min_libssh2_version required for --enable-libssh2" fi libssh2=no fi From cac8f4a60fc5c372bacd59eeff0646955fb4f246 Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Tue, 23 Apr 2013 14:03:33 +0800 Subject: [PATCH 02/11] sheepdog: add discard/trim support for sheepdog The 'TRIM' command from VM that is to release underlying data storage for better thin-provision is already supported by the Sheepdog. This patch adds the TRIM support at QEMU part. For older Sheepdog that doesn't support it, we return 0(success) to upper layer. Cc: MORITA Kazutaka Cc: Kevin Wolf Cc: Stefan Hajnoczi Cc: Paolo Bonzini Signed-off-by: Liu Yuan Signed-off-by: Stefan Hajnoczi --- block/sheepdog.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index 20b5d06c52..6a72c1ee40 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -27,6 +27,8 @@ #define SD_OP_CREATE_AND_WRITE_OBJ 0x01 #define SD_OP_READ_OBJ 0x02 #define SD_OP_WRITE_OBJ 0x03 +/* 0x04 is used internally by Sheepdog */ +#define SD_OP_DISCARD_OBJ 0x05 #define SD_OP_NEW_VDI 0x11 #define SD_OP_LOCK_VDI 0x12 @@ -269,6 +271,7 @@ enum AIOCBState { AIOCB_WRITE_UDATA, AIOCB_READ_UDATA, AIOCB_FLUSH_CACHE, + AIOCB_DISCARD_OBJ, }; struct SheepdogAIOCB { @@ -298,6 +301,7 @@ typedef struct BDRVSheepdogState { char name[SD_MAX_VDI_LEN]; bool is_snapshot; uint32_t cache_flags; + bool discard_supported; char *host_spec; bool is_unix; @@ -656,7 +660,7 @@ static void coroutine_fn aio_read_response(void *opaque) int ret; AIOReq *aio_req = NULL; SheepdogAIOCB *acb; - unsigned long idx; + uint64_t idx; if (QLIST_EMPTY(&s->inflight_aio_head)) { goto out; @@ -727,6 +731,21 @@ static void coroutine_fn aio_read_response(void *opaque) rsp.result = SD_RES_SUCCESS; } break; + case AIOCB_DISCARD_OBJ: + switch (rsp.result) { + case SD_RES_INVALID_PARMS: + error_report("sheep(%s) doesn't support discard command", + s->host_spec); + rsp.result = SD_RES_SUCCESS; + s->discard_supported = false; + break; + case SD_RES_SUCCESS: + idx = data_oid_to_idx(aio_req->oid); + s->inode.data_vdi_id[idx] = 0; + break; + default: + break; + } } if (rsp.result != SD_RES_SUCCESS) { @@ -1016,6 +1035,9 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, wlen = datalen; hdr.flags = SD_FLAG_CMD_WRITE | flags; break; + case AIOCB_DISCARD_OBJ: + hdr.opcode = SD_OP_DISCARD_OBJ; + break; } if (s->cache_flags) { @@ -1197,6 +1219,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags) if (flags & BDRV_O_NOCACHE) { s->cache_flags = SD_FLAG_CMD_DIRECT; } + s->discard_supported = true; if (snapid || tag[0] != '\0') { dprintf("%" PRIx32 " snapshot inode was open.\n", vid); @@ -1662,6 +1685,15 @@ static int coroutine_fn sd_co_rw_vector(void *p) flags = SD_FLAG_CMD_COW; } break; + case AIOCB_DISCARD_OBJ: + /* + * We discard the object only when the whole object is + * 1) allocated 2) trimmed. Otherwise, simply skip it. + */ + if (len != SD_DATA_OBJ_SIZE || inode->data_vdi_id[idx] == 0) { + goto done; + } + break; default: break; } @@ -2107,6 +2139,33 @@ static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data, } +static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + SheepdogAIOCB *acb; + QEMUIOVector dummy; + BDRVSheepdogState *s = bs->opaque; + int ret; + + if (!s->discard_supported) { + return 0; + } + + acb = sd_aio_setup(bs, &dummy, sector_num, nb_sectors); + acb->aiocb_type = AIOCB_DISCARD_OBJ; + acb->aio_done_func = sd_finish_aiocb; + + ret = sd_co_rw_vector(acb); + if (ret <= 0) { + qemu_aio_release(acb); + return ret; + } + + qemu_coroutine_yield(); + + return acb->ret; +} + static QEMUOptionParameter sd_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -2139,6 +2198,7 @@ static BlockDriver bdrv_sheepdog = { .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, + .bdrv_co_discard = sd_co_discard, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, @@ -2164,6 +2224,7 @@ static BlockDriver bdrv_sheepdog_tcp = { .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, + .bdrv_co_discard = sd_co_discard, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, @@ -2189,6 +2250,7 @@ static BlockDriver bdrv_sheepdog_unix = { .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, + .bdrv_co_discard = sd_co_discard, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, From e8bfaa2faeb7c9585a5586aafaad5f3affc37814 Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Tue, 23 Apr 2013 14:03:34 +0800 Subject: [PATCH 03/11] sheepdog: use BDRV_SECTOR_SIZE Cc: MORITA Kazutaka Cc: Kevin Wolf Cc: Stefan Hajnoczi Signed-off-by: Liu Yuan Signed-off-by: Stefan Hajnoczi --- block/sheepdog.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index 6a72c1ee40..2772e8e3e8 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -89,7 +89,6 @@ #define SD_NR_VDIS (1U << 24) #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) -#define SECTOR_SIZE 512 #define SD_INODE_SIZE (sizeof(SheepdogInode)) #define CURRENT_VDI_ID 0 @@ -1246,7 +1245,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags) s->min_dirty_data_idx = UINT32_MAX; s->max_dirty_data_idx = 0; - bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE; + bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE; pstrcpy(s->name, sizeof(s->name), vdi); qemu_co_mutex_init(&s->lock); qemu_opts_del(opts); @@ -1633,10 +1632,10 @@ static int coroutine_fn sd_co_rw_vector(void *p) { SheepdogAIOCB *acb = p; int ret = 0; - unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE; - unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE; + unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE; + unsigned long idx = acb->sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE; uint64_t oid; - uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE; + uint64_t offset = (acb->sector_num * BDRV_SECTOR_SIZE) % SD_DATA_OBJ_SIZE; BDRVSheepdogState *s = acb->common.bs->opaque; SheepdogInode *inode = &s->inode; AIOReq *aio_req; @@ -1755,7 +1754,7 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, int ret; if (bs->growable && sector_num + nb_sectors > bs->total_sectors) { - ret = sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE); + ret = sd_truncate(bs, (sector_num + nb_sectors) * BDRV_SECTOR_SIZE); if (ret < 0) { return ret; } From 8d71c63137600a41b5b959217c0492278536b3dc Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Tue, 23 Apr 2013 14:03:35 +0800 Subject: [PATCH 04/11] sheepdog: implement .bdrv_co_is_allocated() Cc: MORITA Kazutaka Cc: Kevin Wolf Cc: Stefan Hajnoczi Signed-off-by: Liu Yuan Signed-off-by: Stefan Hajnoczi --- block/sheepdog.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/block/sheepdog.c b/block/sheepdog.c index 2772e8e3e8..9f30a87e32 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -2165,6 +2165,40 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, return acb->ret; } +static coroutine_fn int +sd_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + int *pnum) +{ + BDRVSheepdogState *s = bs->opaque; + SheepdogInode *inode = &s->inode; + unsigned long start = sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE, + end = DIV_ROUND_UP((sector_num + nb_sectors) * + BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE); + unsigned long idx; + int ret = 1; + + for (idx = start; idx < end; idx++) { + if (inode->data_vdi_id[idx] == 0) { + break; + } + } + if (idx == start) { + /* Get the longest length of unallocated sectors */ + ret = 0; + for (idx = start + 1; idx < end; idx++) { + if (inode->data_vdi_id[idx] != 0) { + break; + } + } + } + + *pnum = (idx - start) * SD_DATA_OBJ_SIZE / BDRV_SECTOR_SIZE; + if (*pnum > nb_sectors) { + *pnum = nb_sectors; + } + return ret; +} + static QEMUOptionParameter sd_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -2198,6 +2232,7 @@ static BlockDriver bdrv_sheepdog = { .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_discard = sd_co_discard, + .bdrv_co_is_allocated = sd_co_is_allocated, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, @@ -2224,6 +2259,7 @@ static BlockDriver bdrv_sheepdog_tcp = { .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_discard = sd_co_discard, + .bdrv_co_is_allocated = sd_co_is_allocated, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, @@ -2250,6 +2286,7 @@ static BlockDriver bdrv_sheepdog_unix = { .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_discard = sd_co_discard, + .bdrv_co_is_allocated = sd_co_is_allocated, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, From 8ec7d390b0d50b5e5b4b1d8dba7ba40d64a70875 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 24 Apr 2013 15:29:29 +0200 Subject: [PATCH 05/11] block: Disable driver-specific options for 1.5 We don't want to commit to the API yet before everything is worked out. Disable it for the 1.5 release. This commit is meant to be reverted after the 1.5 release. The disabling of the driver-specific options is achieved by applying the old checks while parsing the command line. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Signed-off-by: Stefan Hajnoczi --- blockdev.c | 118 +++++++++++++++++++++++++++++++++++++-- tests/qemu-iotests/group | 2 +- 2 files changed, 115 insertions(+), 5 deletions(-) diff --git a/blockdev.c b/blockdev.c index 8a1652b722..6e293e9550 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1656,10 +1656,120 @@ QemuOptsList qemu_drive_opts = { .name = "drive", .head = QTAILQ_HEAD_INITIALIZER(qemu_drive_opts.head), .desc = { - /* - * no elements => accept any params - * validation will happen later - */ + { + .name = "bus", + .type = QEMU_OPT_NUMBER, + .help = "bus number", + },{ + .name = "unit", + .type = QEMU_OPT_NUMBER, + .help = "unit number (i.e. lun for scsi)", + },{ + .name = "if", + .type = QEMU_OPT_STRING, + .help = "interface (ide, scsi, sd, mtd, floppy, pflash, virtio)", + },{ + .name = "index", + .type = QEMU_OPT_NUMBER, + .help = "index number", + },{ + .name = "cyls", + .type = QEMU_OPT_NUMBER, + .help = "number of cylinders (ide disk geometry)", + },{ + .name = "heads", + .type = QEMU_OPT_NUMBER, + .help = "number of heads (ide disk geometry)", + },{ + .name = "secs", + .type = QEMU_OPT_NUMBER, + .help = "number of sectors (ide disk geometry)", + },{ + .name = "trans", + .type = QEMU_OPT_STRING, + .help = "chs translation (auto, lba. none)", + },{ + .name = "media", + .type = QEMU_OPT_STRING, + .help = "media type (disk, cdrom)", + },{ + .name = "snapshot", + .type = QEMU_OPT_BOOL, + .help = "enable/disable snapshot mode", + },{ + .name = "file", + .type = QEMU_OPT_STRING, + .help = "disk image", + },{ + .name = "discard", + .type = QEMU_OPT_STRING, + .help = "discard operation (ignore/off, unmap/on)", + },{ + .name = "cache", + .type = QEMU_OPT_STRING, + .help = "host cache usage (none, writeback, writethrough, " + "directsync, unsafe)", + },{ + .name = "aio", + .type = QEMU_OPT_STRING, + .help = "host AIO implementation (threads, native)", + },{ + .name = "format", + .type = QEMU_OPT_STRING, + .help = "disk format (raw, qcow2, ...)", + },{ + .name = "serial", + .type = QEMU_OPT_STRING, + .help = "disk serial number", + },{ + .name = "rerror", + .type = QEMU_OPT_STRING, + .help = "read error action", + },{ + .name = "werror", + .type = QEMU_OPT_STRING, + .help = "write error action", + },{ + .name = "addr", + .type = QEMU_OPT_STRING, + .help = "pci address (virtio only)", + },{ + .name = "readonly", + .type = QEMU_OPT_BOOL, + .help = "open drive file as read-only", + },{ + .name = "iops", + .type = QEMU_OPT_NUMBER, + .help = "limit total I/O operations per second", + },{ + .name = "iops_rd", + .type = QEMU_OPT_NUMBER, + .help = "limit read operations per second", + },{ + .name = "iops_wr", + .type = QEMU_OPT_NUMBER, + .help = "limit write operations per second", + },{ + .name = "bps", + .type = QEMU_OPT_NUMBER, + .help = "limit total bytes per second", + },{ + .name = "bps_rd", + .type = QEMU_OPT_NUMBER, + .help = "limit read bytes per second", + },{ + .name = "bps_wr", + .type = QEMU_OPT_NUMBER, + .help = "limit write bytes per second", + },{ + .name = "copy-on-read", + .type = QEMU_OPT_BOOL, + .help = "copy read data from backing file into image file", + },{ + .name = "boot", + .type = QEMU_OPT_BOOL, + .help = "(deprecated, ignored)", + }, { /* end of list */ } }, }; diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 68eabdaab9..bf944d9123 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -57,6 +57,6 @@ 048 img auto quick 049 rw auto 050 rw auto backing quick -051 rw auto +#051 rw auto 052 rw auto backing 053 rw auto From c3ca988d2b0ee94dc8d53eff4b1c2de4ac06a270 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 25 Apr 2013 15:59:27 +0200 Subject: [PATCH 06/11] rbd: Fix use after free in rbd_open() Commit a9ccedc3 frees the QemuOpts for the driver-specific options immediately, even though it still needs the filename string that is contained there. This doesn't work. Move the deletion of the QemuOpts to the end of the function where its content isn't needed any more. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Signed-off-by: Stefan Hajnoczi --- block/rbd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index 1826411492..0f2608b287 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -478,20 +478,20 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags) } filename = qemu_opt_get(opts, "filename"); - qemu_opts_del(opts); if (qemu_rbd_parsename(filename, pool, sizeof(pool), snap_buf, sizeof(snap_buf), s->name, sizeof(s->name), conf, sizeof(conf)) < 0) { - return -EINVAL; + r = -EINVAL; + goto failed_opts; } clientname = qemu_rbd_parse_clientname(conf, clientname_buf); r = rados_create(&s->cluster, clientname); if (r < 0) { error_report("error initializing"); - return r; + goto failed_opts; } s->snap = NULL; @@ -557,6 +557,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags) NULL, qemu_rbd_aio_flush_cb, s); + qemu_opts_del(opts); return 0; failed: @@ -566,6 +567,8 @@ failed_open: failed_shutdown: rados_shutdown(s->cluster); g_free(s->snap); +failed_opts: + qemu_opts_del(opts); return r; } From 982dcbf4cbe80fa362c1edc37b2ced1cb8bcf37b Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka Date: Fri, 26 Apr 2013 01:19:51 +0900 Subject: [PATCH 07/11] sheepdog: cleanup find_vdi_name This makes 'filename' and 'tag' constant variables, and renames 'for_snapshot' to 'lock' to clear how it works. Signed-off-by: MORITA Kazutaka Signed-off-by: Stefan Hajnoczi --- block/sheepdog.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index 9f30a87e32..4326664200 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -941,8 +941,9 @@ static int parse_vdiname(BDRVSheepdogState *s, const char *filename, return ret; } -static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid, - char *tag, uint32_t *vid, int for_snapshot) +static int find_vdi_name(BDRVSheepdogState *s, const char *filename, + uint32_t snapid, const char *tag, uint32_t *vid, + bool lock) { int ret, fd; SheepdogVdiReq hdr; @@ -963,10 +964,10 @@ static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid, strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); memset(&hdr, 0, sizeof(hdr)); - if (for_snapshot) { - hdr.opcode = SD_OP_GET_VDI_INFO; - } else { + if (lock) { hdr.opcode = SD_OP_LOCK_VDI; + } else { + hdr.opcode = SD_OP_GET_VDI_INFO; } wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN; hdr.proto_ver = SD_PROTO_VER; @@ -1205,7 +1206,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags) goto out; } - ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0); + ret = find_vdi_name(s, vdi, snapid, tag, &vid, true); if (ret) { goto out; } @@ -1921,7 +1922,7 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) pstrcpy(tag, sizeof(tag), s->name); } - ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1); + ret = find_vdi_name(s, vdi, snapid, tag, &vid, false); if (ret) { error_report("Failed to find_vdi_name"); goto out; From 6a0b5490338ed0fdf55c43062c88dd7638f05d6d Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka Date: Fri, 26 Apr 2013 01:19:52 +0900 Subject: [PATCH 08/11] sheepdog: add SD_RES_READONLY result code Sheepdog returns SD_RES_READONLY when qemu sends write requests to the snapshot vdi. This adds the result code and makes sd_strerror() print its error reason. Signed-off-by: MORITA Kazutaka Signed-off-by: Stefan Hajnoczi --- block/sheepdog.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/sheepdog.c b/block/sheepdog.c index 4326664200..f4e7204c7c 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -68,6 +68,7 @@ #define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */ #define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */ #define SD_RES_HALT 0x19 /* Sheepdog is stopped serving IO request */ +#define SD_RES_READONLY 0x1A /* Object is read-only */ /* * Object ID rules @@ -349,6 +350,7 @@ static const char * sd_strerror(int err) {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"}, {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"}, {SD_RES_HALT, "Sheepdog is stopped serving IO request"}, + {SD_RES_READONLY, "Object is read-only"}, }; for (i = 0; i < ARRAY_SIZE(errors); ++i) { From 9ff53a0eb89afacfa1ba56b009d40be942d3bd63 Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka Date: Fri, 26 Apr 2013 01:19:53 +0900 Subject: [PATCH 09/11] sheepdog: add helper function to reload inode This adds a helper function to update the current inode state with the specified vdi object. Signed-off-by: MORITA Kazutaka Signed-off-by: Stefan Hajnoczi --- block/sheepdog.c | 67 ++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index f4e7204c7c..0eaf4c3d8d 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -1150,6 +1150,42 @@ static int write_object(int fd, char *buf, uint64_t oid, int copies, create, cache_flags); } +/* update inode with the latest state */ +static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag) +{ + SheepdogInode *inode; + int ret = 0, fd; + uint32_t vid = 0; + + fd = connect_to_sdog(s); + if (fd < 0) { + return -EIO; + } + + inode = g_malloc(sizeof(s->inode)); + + ret = find_vdi_name(s, s->name, snapid, tag, &vid, false); + if (ret) { + goto out; + } + + ret = read_object(fd, (char *)inode, vid_to_vdi_oid(vid), + s->inode.nr_copies, sizeof(*inode), 0, s->cache_flags); + if (ret < 0) { + goto out; + } + + if (inode->vdi_id != s->inode.vdi_id) { + memcpy(&s->inode, inode, sizeof(s->inode)); + } + +out: + g_free(inode); + closesocket(fd); + + return ret; +} + /* TODO Convert to fine grained options */ static QemuOptsList runtime_opts = { .name = "sheepdog", @@ -1905,18 +1941,14 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) { BDRVSheepdogState *s = bs->opaque; BDRVSheepdogState *old_s; - char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; - char *buf = NULL; - uint32_t vid; + char tag[SD_MAX_VDI_TAG_LEN]; uint32_t snapid = 0; - int ret = 0, fd; + int ret = 0; old_s = g_malloc(sizeof(BDRVSheepdogState)); memcpy(old_s, s, sizeof(BDRVSheepdogState)); - pstrcpy(vdi, sizeof(vdi), s->name); - snapid = strtoul(snapshot_id, NULL, 10); if (snapid) { tag[0] = 0; @@ -1924,30 +1956,11 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) pstrcpy(tag, sizeof(tag), s->name); } - ret = find_vdi_name(s, vdi, snapid, tag, &vid, false); - if (ret) { - error_report("Failed to find_vdi_name"); - goto out; - } - - fd = connect_to_sdog(s); - if (fd < 0) { - ret = fd; - goto out; - } - - buf = g_malloc(SD_INODE_SIZE); - ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies, - SD_INODE_SIZE, 0, s->cache_flags); - - closesocket(fd); - + ret = reload_inode(s, snapid, tag); if (ret) { goto out; } - memcpy(&s->inode, buf, sizeof(s->inode)); - if (!s->inode.vm_state_size) { error_report("Invalid snapshot"); ret = -ENOENT; @@ -1956,14 +1969,12 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) s->is_snapshot = true; - g_free(buf); g_free(old_s); return 0; out: /* recover bdrv_sd_state */ memcpy(s, old_s, sizeof(BDRVSheepdogState)); - g_free(buf); g_free(old_s); error_report("failed to open. recover old bdrv_sd_state."); From 13c31de2fdd534c065ce4710f6e8df3921e98c4f Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka Date: Fri, 26 Apr 2013 01:19:54 +0900 Subject: [PATCH 10/11] sheepdog: resend write requests when SD_RES_READONLY is received When a snapshot is taken from out side of qemu (e.g. qemu-img snapshot), write requests to the current vdi return SD_RES_READONLY. In this case, the sheepdog block driver needs to update the current inode to the latest one and resend the write requests. Signed-off-by: MORITA Kazutaka Signed-off-by: Stefan Hajnoczi --- block/sheepdog.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index 0eaf4c3d8d..77e21fdb7f 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -605,6 +605,7 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data, static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, struct iovec *iov, int niov, bool create, enum AIOCBState aiocb_type); +static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req); static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid) @@ -749,9 +750,19 @@ static void coroutine_fn aio_read_response(void *opaque) } } - if (rsp.result != SD_RES_SUCCESS) { + switch (rsp.result) { + case SD_RES_SUCCESS: + break; + case SD_RES_READONLY: + ret = resend_aioreq(s, aio_req); + if (ret == SD_RES_SUCCESS) { + goto out; + } + /* fall through */ + default: acb->ret = -EIO; error_report("%s", sd_strerror(rsp.result)); + break; } free_aio_req(s, aio_req); @@ -1186,6 +1197,53 @@ out: return ret; } +static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) +{ + SheepdogAIOCB *acb = aio_req->aiocb; + bool create = false; + int ret; + + ret = reload_inode(s, 0, ""); + if (ret < 0) { + return ret; + } + + aio_req->oid = vid_to_data_oid(s->inode.vdi_id, + data_oid_to_idx(aio_req->oid)); + + /* check whether this request becomes a CoW one */ + if (acb->aiocb_type == AIOCB_WRITE_UDATA) { + int idx = data_oid_to_idx(aio_req->oid); + AIOReq *areq; + + if (s->inode.data_vdi_id[idx] == 0) { + create = true; + goto out; + } + if (is_data_obj_writable(&s->inode, idx)) { + goto out; + } + + /* link to the pending list if there is another CoW request to + * the same object */ + QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { + if (areq != aio_req && areq->oid == aio_req->oid) { + dprintf("simultaneous CoW to %" PRIx64 "\n", aio_req->oid); + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings); + return SD_RES_SUCCESS; + } + } + + aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); + aio_req->flags |= SD_FLAG_CMD_COW; + create = true; + } +out: + return add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, + create, acb->aiocb_type); +} + /* TODO Convert to fine grained options */ static QemuOptsList runtime_opts = { .name = "sheepdog", From 859e5553a428225de6b8ef302cdcfd68d140b926 Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Thu, 25 Apr 2013 20:49:39 +0800 Subject: [PATCH 11/11] sheepdog: fix loadvm operation Currently the 'loadvm' opertaion works as following: 1. switch to the snapshot 2. mark current working VDI as a snapshot 3. rely on sd_create_branch to create a new working VDI based on the snapshot This works not the same as other format as QCOW2. For e.g, qemu > savevm # get a live snapshot snap1 qemu > savevm # snap2 qemu > loadvm 1 # This will steally create snap3 of the working VDI Which will result in following snapshot chain: base <-- snap1 <-- snap2 <-- snap3 ^ | working VDI snap3 was unnecessarily created and might be annoying users. This patch discard the unnecessary 'snap3' creation. and implement rollback(loadvm) operation to the specified snapshot by 1. switch to the snapshot 2. delete working VDI 3. rely on sd_create_branch to create a new working VDI based on the snapshot The snapshot chain for above example will be: base <-- snap1 <-- snap2 ^ | working VDI Cc: qemu-devel@nongnu.org Cc: MORITA Kazutaka Cc: Kevin Wolf Cc: Stefan Hajnoczi Signed-off-by: Liu Yuan Reviewed-by: MORITA Kazutaka Signed-off-by: Stefan Hajnoczi --- block/sheepdog.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index 77e21fdb7f..21a4edf15b 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -36,6 +36,7 @@ #define SD_OP_GET_VDI_INFO 0x14 #define SD_OP_READ_VDIS 0x15 #define SD_OP_FLUSH_VDI 0x16 +#define SD_OP_DEL_VDI 0x17 #define SD_FLAG_CMD_WRITE 0x01 #define SD_FLAG_CMD_COW 0x02 @@ -1666,6 +1667,43 @@ out: sd_finish_aiocb(acb); } +/* Delete current working VDI on the snapshot chain */ +static bool sd_delete(BDRVSheepdogState *s) +{ + unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0; + SheepdogVdiReq hdr = { + .opcode = SD_OP_DEL_VDI, + .vdi_id = s->inode.vdi_id, + .data_length = wlen, + .flags = SD_FLAG_CMD_WRITE, + }; + SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; + int fd, ret; + + fd = connect_to_sdog(s); + if (fd < 0) { + return false; + } + + ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen); + closesocket(fd); + if (ret) { + return false; + } + switch (rsp->result) { + case SD_RES_NO_VDI: + error_report("%s was already deleted", s->name); + /* fall through */ + case SD_RES_SUCCESS: + break; + default: + error_report("%s, %s", sd_strerror(rsp->result), s->name); + return false; + } + + return true; +} + /* * Create a writable VDI from a snapshot */ @@ -1674,12 +1712,20 @@ static int sd_create_branch(BDRVSheepdogState *s) int ret, fd; uint32_t vid; char *buf; + bool deleted; dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id); buf = g_malloc(SD_INODE_SIZE); - ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1); + /* + * Even If deletion fails, we will just create extra snapshot based on + * the workding VDI which was supposed to be deleted. So no need to + * false bail out. + */ + deleted = sd_delete(s); + ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, + !deleted); if (ret) { goto out; } @@ -1995,6 +2041,12 @@ cleanup: return ret; } +/* + * We implement rollback(loadvm) operation to the specified snapshot by + * 1) switch to the snapshot + * 2) rely on sd_create_branch to delete working VDI and + * 3) create a new working VDI based on the speicified snapshot + */ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) { BDRVSheepdogState *s = bs->opaque;