From 45f1882a9e1a8421404ea47de6aadf3d945618d0 Mon Sep 17 00:00:00 2001 From: John Snow Date: Tue, 21 Nov 2017 19:16:56 -0500 Subject: [PATCH 01/10] iotests: fix 075 and 078 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both of these tests are for formats which now stipulate that they are read-only. Adjust the tests to match. Signed-off-by: John Snow Reviewed-by: Eric Blake Reviewed-by: Lukáš Doktor Signed-off-by: Kevin Wolf --- tests/qemu-iotests/075 | 18 +++++++++--------- tests/qemu-iotests/078 | 14 +++++++------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/qemu-iotests/075 b/tests/qemu-iotests/075 index 770d51c6cb..caa30d4743 100755 --- a/tests/qemu-iotests/075 +++ b/tests/qemu-iotests/075 @@ -48,56 +48,56 @@ offsets_offset=136 echo echo "== check that the first sector can be read ==" _use_sample_img simple-pattern.cloop.bz2 -$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== check that the last sector can be read ==" _use_sample_img simple-pattern.cloop.bz2 -$QEMU_IO -c "read $((1024 * 1024 - 512)) 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -r -c "read $((1024 * 1024 - 512)) 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== block_size must be a multiple of 512 ==" _use_sample_img simple-pattern.cloop.bz2 poke_file "$TEST_IMG" "$block_size_offset" "\x00\x00\x02\x01" -$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== block_size cannot be zero ==" _use_sample_img simple-pattern.cloop.bz2 poke_file "$TEST_IMG" "$block_size_offset" "\x00\x00\x00\x00" -$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== huge block_size ===" _use_sample_img simple-pattern.cloop.bz2 poke_file "$TEST_IMG" "$block_size_offset" "\xff\xff\xfe\x00" -$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== offsets_size overflow ===" _use_sample_img simple-pattern.cloop.bz2 poke_file "$TEST_IMG" "$n_blocks_offset" "\xff\xff\xff\xff" -$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== refuse images that require too many offsets ===" _use_sample_img simple-pattern.cloop.bz2 poke_file "$TEST_IMG" "$n_blocks_offset" "\x04\x00\x00\x01" -$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== refuse images with non-monotonically increasing offsets ==" _use_sample_img simple-pattern.cloop.bz2 poke_file "$TEST_IMG" "$offsets_offset" "\x00\x00\x00\x00\xff\xff\xff\xff" poke_file "$TEST_IMG" $((offsets_offset + 8)) "\x00\x00\x00\x00\xff\xfe\x00\x00" -$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== refuse images with invalid compressed block size ==" _use_sample_img simple-pattern.cloop.bz2 poke_file "$TEST_IMG" "$offsets_offset" "\x00\x00\x00\x00\x00\x00\x00\x00" poke_file "$TEST_IMG" $((offsets_offset + 8)) "\xff\xff\xff\xff\xff\xff\xff\xff" -$QEMU_IO -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -r -c "read 0 512" $TEST_IMG 2>&1 | _filter_qemu_io | _filter_testdir # success, all done echo "*** done" diff --git a/tests/qemu-iotests/078 b/tests/qemu-iotests/078 index f333e9ac84..a106c26f6b 100755 --- a/tests/qemu-iotests/078 +++ b/tests/qemu-iotests/078 @@ -48,41 +48,41 @@ disk_size_offset=$((0x58)) echo echo "== Read from a valid image ==" _use_sample_img empty.bochs.bz2 -{ $QEMU_IO -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir +{ $QEMU_IO -r -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== Negative catalog size ==" _use_sample_img empty.bochs.bz2 poke_file "$TEST_IMG" "$catalog_size_offset" "\xff\xff\xff\xff" -{ $QEMU_IO -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir +{ $QEMU_IO -r -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== Overflow for catalog size * sizeof(uint32_t) ==" _use_sample_img empty.bochs.bz2 poke_file "$TEST_IMG" "$catalog_size_offset" "\x00\x00\x00\x40" -{ $QEMU_IO -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir +{ $QEMU_IO -r -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== Too small catalog bitmap for image size ==" _use_sample_img empty.bochs.bz2 poke_file "$TEST_IMG" "$disk_size_offset" "\x00\xc0\x0f\x00\x00\x00\x00\x7f" -{ $QEMU_IO -c "read 2T 4k" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir +{ $QEMU_IO -r -c "read 2T 4k" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir _use_sample_img empty.bochs.bz2 poke_file "$TEST_IMG" "$catalog_size_offset" "\x10\x00\x00\x00" -{ $QEMU_IO -c "read 0xfbe00 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir +{ $QEMU_IO -r -c "read 0xfbe00 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== Negative extent size ==" _use_sample_img empty.bochs.bz2 poke_file "$TEST_IMG" "$extent_size_offset" "\x00\x00\x00\x80" -{ $QEMU_IO -c "read 768k 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir +{ $QEMU_IO -r -c "read 768k 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir echo echo "== Zero extent size ==" _use_sample_img empty.bochs.bz2 poke_file "$TEST_IMG" "$extent_size_offset" "\x00\x00\x00\x00" -{ $QEMU_IO -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir +{ $QEMU_IO -r -c "read 0 512" $TEST_IMG; } 2>&1 | _filter_qemu_io | _filter_testdir # success, all done echo "*** done" From b1d1cb272882dd6868740155120f6aeb260a204c Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 24 Nov 2017 16:53:50 +0800 Subject: [PATCH 02/10] docs: Add image locking subsection This documents the image locking feature and explains when and how related options can be used. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- docs/qemu-block-drivers.texi | 36 ++++++++++++++++++++++++++++++++++++ qemu-doc.texi | 1 + 2 files changed, 37 insertions(+) diff --git a/docs/qemu-block-drivers.texi b/docs/qemu-block-drivers.texi index 1cb1e55686..503c1847aa 100644 --- a/docs/qemu-block-drivers.texi +++ b/docs/qemu-block-drivers.texi @@ -785,6 +785,42 @@ warning: ssh server @code{ssh.example.com:22} does not support fsync With sufficiently new versions of libssh2 and OpenSSH, @code{fsync} is supported. +@node disk_image_locking +@subsection Disk image file locking + +By default, QEMU tries to protect image files from unexpected concurrent +access, as long as it's supported by the block protocol driver and host +operating system. If multiple QEMU processes (including QEMU emulators and +utilities) try to open the same image with conflicting accessing modes, all but +the first one will get an error. + +This feature is currently supported by the file protocol on Linux with the Open +File Descriptor (OFD) locking API, and can be configured to fall back to POSIX +locking if the POSIX host doesn't support Linux OFD locking. + +To explicitly enable image locking, specify "locking=on" in the file protocol +driver options. If OFD locking is not possible, a warning will be printed and +the POSIX locking API will be used. In this case there is a risk that the lock +will get silently lost when doing hot plugging and block jobs, due to the +shortcomings of the POSIX locking API. + +QEMU transparently handles lock handover during shared storage migration. For +shared virtual disk images between multiple VMs, the "share-rw" device option +should be used. + +Alternatively, locking can be fully disabled by "locking=off" block device +option. In the command line, the option is usually in the form of +"file.locking=off" as the protocol driver is normally placed as a "file" child +under a format driver. For example: + +@code{-blockdev driver=qcow2,file.filename=/path/to/image,file.locking=off,file.driver=file} + +To check if image locking is active, check the output of the "lslocks" command +on host and see if there are locks held by the QEMU process on the image file. +More than one byte could be locked by the QEMU instance, each byte of which +reflects a particular permission that is acquired or protected by the running +block driver. + @c man end @ignore diff --git a/qemu-doc.texi b/qemu-doc.texi index 617254917d..db2351c746 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -405,6 +405,7 @@ encrypted disk images. * disk_images_iscsi:: iSCSI LUNs * disk_images_gluster:: GlusterFS disk images * disk_images_ssh:: Secure Shell (ssh) disk images +* disk_image_locking:: Disk image file locking @end menu @node disk_images_quickstart From 1878eaff9bdeece4546d4f9587e6c75ab0b79b8b Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 24 Nov 2017 16:53:51 +0800 Subject: [PATCH 03/10] qemu-options: Mention locking option of file driver Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- qemu-options.hx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/qemu-options.hx b/qemu-options.hx index 3728e9b4dd..f11c4ac960 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -693,6 +693,10 @@ This is the protocol-level block driver for accessing regular files. The path to the image file in the local filesystem @item aio Specifies the AIO backend (threads/native, default: threads) +@item locking +Specifies whether the image file is protected with Linux OFD / POSIX locks. The +default is to use the Linux Open File Descriptor API if available, otherwise no +lock is applied. (auto/on/off, default: auto) @end table Example: @example From c117bb14ff633848cc6b0ff77f081f583dfa8c5e Mon Sep 17 00:00:00 2001 From: Kashyap Chamarthy Date: Tue, 21 Nov 2017 12:52:53 +0100 Subject: [PATCH 04/10] QAPI & interop: Clarify events emitted by 'block-job-cancel' When you cancel an in-progress 'mirror' job (or "active `block-commit`") with QMP `block-job-cancel`, it emits the event: BLOCK_JOB_CANCELLED. However, when `block-job-cancel` is issued *after* `drive-mirror` has indicated (via the event BLOCK_JOB_READY) that the source and destination have reached synchronization: [...] # Snip `drive-mirror` invocation & outputs { "execute":"block-job-cancel", "arguments":{ "device":"virtio0" } } {"return": {}} It (`block-job-cancel`) will counterintuitively emit the event 'BLOCK_JOB_COMPLETED': { "timestamp":{ "seconds":1510678024, "microseconds":526240 }, "event":"BLOCK_JOB_COMPLETED", "data":{ "device":"virtio0", "len":41126400, "offset":41126400, "speed":0, "type":"mirror" } } But this is expected behaviour, where the _COMPLETED event indicates that synchronization has successfully ended (and the destination now has a point-in-time copy, which is at the time of cancel). So add a small note to this effect in 'block-core.json'. While at it, also update the "Live disk synchronization -- drive-mirror and blockdev-mirror" section in 'live-block-operations.rst'. (Thanks: Max Reitz for reminding me of this caveat on IRC.) Signed-off-by: Kashyap Chamarthy Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- docs/interop/live-block-operations.rst | 46 +++++++++++++++++--------- qapi/block-core.json | 6 ++++ 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/docs/interop/live-block-operations.rst b/docs/interop/live-block-operations.rst index 5f0179749f..734252bc80 100644 --- a/docs/interop/live-block-operations.rst +++ b/docs/interop/live-block-operations.rst @@ -506,26 +506,40 @@ Again, given our familiar disk image chain:: [A] <-- [B] <-- [C] <-- [D] -The ``drive-mirror`` (and its newer equivalent ``blockdev-mirror``) allows -you to copy data from the entire chain into a single target image (which -can be located on a different host). +The ``drive-mirror`` (and its newer equivalent ``blockdev-mirror``) +allows you to copy data from the entire chain into a single target image +(which can be located on a different host), [E]. -Once a 'mirror' job has started, there are two possible actions while a -``drive-mirror`` job is active: +.. note:: -(1) Issuing the command ``block-job-cancel`` after it emits the event - ``BLOCK_JOB_CANCELLED``: will (after completing synchronization of - the content from the disk image chain to the target image, [E]) - create a point-in-time (which is at the time of *triggering* the - cancel command) copy, contained in image [E], of the the entire disk + When you cancel an in-progress 'mirror' job *before* the source and + target are synchronized, ``block-job-cancel`` will emit the event + ``BLOCK_JOB_CANCELLED``. However, note that if you cancel a + 'mirror' job *after* it has indicated (via the event + ``BLOCK_JOB_READY``) that the source and target have reached + synchronization, then the event emitted by ``block-job-cancel`` + changes to ``BLOCK_JOB_COMPLETED``. + + Besides the 'mirror' job, the "active ``block-commit``" is the only + other block device job that emits the event ``BLOCK_JOB_READY``. + The rest of the block device jobs ('stream', "non-active + ``block-commit``", and 'backup') end automatically. + +So there are two possible actions to take, after a 'mirror' job has +emitted the event ``BLOCK_JOB_READY``, indicating that the source and +target have reached synchronization: + +(1) Issuing the command ``block-job-cancel`` (after it emits the event + ``BLOCK_JOB_COMPLETED``) will create a point-in-time (which is at + the time of *triggering* the cancel command) copy of the entire disk image chain (or only the top-most image, depending on the ``sync`` - mode). + mode), contained in the target image [E]. One use case for this is + live VM migration with non-shared storage. -(2) Issuing the command ``block-job-complete`` after it emits the event - ``BLOCK_JOB_COMPLETED``: will, after completing synchronization of - the content, adjust the guest device (i.e. live QEMU) to point to - the target image, and, causing all the new writes from this point on - to happen there. One use case for this is live storage migration. +(2) Issuing the command ``block-job-complete`` (after it emits the event + ``BLOCK_JOB_COMPLETED``) will adjust the guest device (i.e. live + QEMU) to point to the target image, [E], causing all the new writes + from this point on to happen there. About synchronization modes: The synchronization mode determines *which* part of the disk image chain will be copied to the target. diff --git a/qapi/block-core.json b/qapi/block-core.json index 76bf50f813..dd763dcf87 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2065,6 +2065,12 @@ # BLOCK_JOB_CANCELLED event. Before that happens the job is still visible when # enumerated using query-block-jobs. # +# Note that if you issue 'block-job-cancel' after 'drive-mirror' has indicated +# (via the event BLOCK_JOB_READY) that the source and destination are +# synchronized, then the event triggered by this command changes to +# BLOCK_JOB_COMPLETED, to indicate that the mirroring has ended and the +# destination now has a point-in-time copy tied to the time of the cancellation. +# # For streaming, the image file retains its backing file unless the streaming # operation happens to complete just as it is being cancelled. A new streaming # operation can be started at a later time to finish copying all data from the From 0a3e155f3f5ec9b6f12d00894c7701b3cbb66590 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Tue, 28 Nov 2017 16:53:27 +0200 Subject: [PATCH 05/10] blockjob: Remove the job from the list earlier in block_job_unref() When destroying a block job in block_job_unref() we should remove it from the job list before calling block_job_remove_all_bdrv(). This is because removing the BDSs can trigger an aio_poll() and wake up other jobs that might attempt to use the block job list. If that happens the job we're currently destroying should not be in that list anymore. Signed-off-by: Alberto Garcia Signed-off-by: Kevin Wolf --- blockjob.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blockjob.c b/blockjob.c index ff9a614531..2f0cc1528b 100644 --- a/blockjob.c +++ b/blockjob.c @@ -152,6 +152,7 @@ void block_job_unref(BlockJob *job) { if (--job->refcnt == 0) { BlockDriverState *bs = blk_bs(job->blk); + QLIST_REMOVE(job, job_list); bs->job = NULL; block_job_remove_all_bdrv(job); blk_remove_aio_context_notifier(job->blk, @@ -160,7 +161,6 @@ void block_job_unref(BlockJob *job) blk_unref(job->blk); error_free(job->blocker); g_free(job->id); - QLIST_REMOVE(job, job_list); g_free(job); } } From 02d213009d571bcd7171e3ff9234722a11d30d1b Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 29 Nov 2017 11:25:10 +0100 Subject: [PATCH 06/10] block: Expect graph changes in bdrv_parent_drained_begin/end The .drained_begin/end callbacks can (directly or indirectly via aio_poll()) cause block nodes to be removed or the current BdrvChild to point to a different child node. Use QLIST_FOREACH_SAFE() to make sure we don't access invalid BlockDriverStates or accidentally continue iterating the parents of the new child node instead of the node we actually came from. Signed-off-by: Kevin Wolf Tested-by: Jeff Cody Reviewed-by: Stefan Hajnoczi Reviewed-by: Jeff Cody Reviewed-by: Alberto Garcia Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/io.c b/block/io.c index 4fdf93a014..6773926fc1 100644 --- a/block/io.c +++ b/block/io.c @@ -42,9 +42,9 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, void bdrv_parent_drained_begin(BlockDriverState *bs) { - BdrvChild *c; + BdrvChild *c, *next; - QLIST_FOREACH(c, &bs->parents, next_parent) { + QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { if (c->role->drained_begin) { c->role->drained_begin(c); } @@ -53,9 +53,9 @@ void bdrv_parent_drained_begin(BlockDriverState *bs) void bdrv_parent_drained_end(BlockDriverState *bs) { - BdrvChild *c; + BdrvChild *c, *next; - QLIST_FOREACH(c, &bs->parents, next_parent) { + QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) { if (c->role->drained_end) { c->role->drained_end(c); } From 5bf1d5a73a4a6d0e2d692bd02b6d7f3eedeed3b7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Nov 2017 11:25:11 +0100 Subject: [PATCH 07/10] blockjob: remove clock argument from block_job_sleep_ns All callers are using QEMU_CLOCK_REALTIME, and it will not be possible to support more than one clock when block_job_sleep_ns switches to a single timer stored in the BlockJob struct. Signed-off-by: Paolo Bonzini Reviewed-by: Alberto Garcia Tested-By: Jeff Cody Reviewed-by: Fam Zheng Reviewed-by: Jeff Cody Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/backup.c | 4 ++-- block/commit.c | 2 +- block/mirror.c | 6 +++--- block/stream.c | 2 +- blockjob.c | 5 +++-- include/block/blockjob_int.h | 7 +++---- tests/test-blockjob-txn.c | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/block/backup.c b/block/backup.c index 06ddbfd03d..99e6bcc748 100644 --- a/block/backup.c +++ b/block/backup.c @@ -346,9 +346,9 @@ static bool coroutine_fn yield_and_check(BackupBlockJob *job) uint64_t delay_ns = ratelimit_calculate_delay(&job->limit, job->bytes_read); job->bytes_read = 0; - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); + block_job_sleep_ns(&job->common, delay_ns); } else { - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); + block_job_sleep_ns(&job->common, 0); } if (block_job_is_cancelled(&job->common)) { diff --git a/block/commit.c b/block/commit.c index 5036eec434..c5327551ce 100644 --- a/block/commit.c +++ b/block/commit.c @@ -174,7 +174,7 @@ static void coroutine_fn commit_run(void *opaque) /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. */ - block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); + block_job_sleep_ns(&s->common, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } diff --git a/block/mirror.c b/block/mirror.c index 307b6391a8..c9badc1203 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -598,7 +598,7 @@ static void mirror_throttle(MirrorBlockJob *s) if (now - s->last_pause_ns > SLICE_TIME) { s->last_pause_ns = now; - block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0); + block_job_sleep_ns(&s->common, 0); } else { block_job_pause_point(&s->common); } @@ -870,13 +870,13 @@ static void coroutine_fn mirror_run(void *opaque) ret = 0; trace_mirror_before_sleep(s, cnt, s->synced, delay_ns); if (!s->synced) { - block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); + block_job_sleep_ns(&s->common, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } } else if (!should_complete) { delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0); - block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); + block_job_sleep_ns(&s->common, delay_ns); } s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); } diff --git a/block/stream.c b/block/stream.c index e6f72346e5..499cdacdb0 100644 --- a/block/stream.c +++ b/block/stream.c @@ -141,7 +141,7 @@ static void coroutine_fn stream_run(void *opaque) /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. */ - block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); + block_job_sleep_ns(&s->common, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } diff --git a/blockjob.c b/blockjob.c index 2f0cc1528b..db9e4fc89a 100644 --- a/blockjob.c +++ b/blockjob.c @@ -788,7 +788,7 @@ bool block_job_is_cancelled(BlockJob *job) return job->cancelled; } -void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns) +void block_job_sleep_ns(BlockJob *job, int64_t ns) { assert(job->busy); @@ -803,7 +803,8 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns) * it wakes and runs, otherwise we risk double-entry or entry after * completion. */ if (!block_job_should_pause(job)) { - co_aio_sleep_ns(blk_get_aio_context(job->blk), type, ns); + co_aio_sleep_ns(blk_get_aio_context(job->blk), + QEMU_CLOCK_REALTIME, ns); } block_job_pause_point(job); diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h index 43f3be2965..f7ab183a39 100644 --- a/include/block/blockjob_int.h +++ b/include/block/blockjob_int.h @@ -139,14 +139,13 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, /** * block_job_sleep_ns: * @job: The job that calls the function. - * @clock: The clock to sleep on. * @ns: How many nanoseconds to stop for. * * Put the job to sleep (assuming that it wasn't canceled) for @ns - * nanoseconds. Canceling the job will not interrupt the wait, so the - * cancel will not process until the coroutine wakes up. + * %QEMU_CLOCK_REALTIME nanoseconds. Canceling the job will not interrupt + * the wait, so the cancel will not process until the coroutine wakes up. */ -void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns); +void block_job_sleep_ns(BlockJob *job, int64_t ns); /** * block_job_yield: diff --git a/tests/test-blockjob-txn.c b/tests/test-blockjob-txn.c index c77343fc04..3591c9617f 100644 --- a/tests/test-blockjob-txn.c +++ b/tests/test-blockjob-txn.c @@ -44,7 +44,7 @@ static void coroutine_fn test_block_job_run(void *opaque) while (s->iterations--) { if (s->use_timer) { - block_job_sleep_ns(job, QEMU_CLOCK_REALTIME, 0); + block_job_sleep_ns(job, 0); } else { block_job_yield(job); } From 356f59b8757f47c0aca3e2e4e51d6010f64cade1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Nov 2017 11:25:12 +0100 Subject: [PATCH 08/10] blockjob: introduce block_job_do_yield Hide the clearing of job->busy in a single function, and set it in block_job_enter. This lets block_job_do_yield verify that qemu_coroutine_enter is not used while job->busy = false. Signed-off-by: Paolo Bonzini Tested-By: Jeff Cody Reviewed-by: Fam Zheng Reviewed-by: Jeff Cody Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- blockjob.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/blockjob.c b/blockjob.c index db9e4fc89a..4d22b7d2fb 100644 --- a/blockjob.c +++ b/blockjob.c @@ -729,6 +729,15 @@ static bool block_job_should_pause(BlockJob *job) return job->pause_count > 0; } +static void block_job_do_yield(BlockJob *job) +{ + job->busy = false; + qemu_coroutine_yield(); + + /* Set by block_job_enter before re-entering the coroutine. */ + assert(job->busy); +} + void coroutine_fn block_job_pause_point(BlockJob *job) { assert(job && block_job_started(job)); @@ -746,9 +755,7 @@ void coroutine_fn block_job_pause_point(BlockJob *job) if (block_job_should_pause(job) && !block_job_is_cancelled(job)) { job->paused = true; - job->busy = false; - qemu_coroutine_yield(); /* wait for block_job_resume() */ - job->busy = true; + block_job_do_yield(job); job->paused = false; } @@ -778,9 +785,12 @@ void block_job_enter(BlockJob *job) return; } - if (!job->busy) { - bdrv_coroutine_enter(blk_bs(job->blk), job->co); + if (job->busy) { + return; } + + job->busy = true; + aio_co_wake(job->co); } bool block_job_is_cancelled(BlockJob *job) @@ -819,11 +829,9 @@ void block_job_yield(BlockJob *job) return; } - job->busy = false; if (!block_job_should_pause(job)) { - qemu_coroutine_yield(); + block_job_do_yield(job); } - job->busy = true; block_job_pause_point(job); } From fc24908e7d232b79c2a6f0363f52bda18dedb57b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Nov 2017 11:25:13 +0100 Subject: [PATCH 09/10] blockjob: reimplement block_job_sleep_ns to allow cancellation This reverts the effects of commit 4afeffc857 ("blockjob: do not allow coroutine double entry or entry-after-completion", 2017-11-21) This fixed the symptom of a bug rather than the root cause. Canceling the wait on a sleeping blockjob coroutine is generally fine, we just need to make it work correctly across AioContexts. To do so, use a QEMUTimer that calls block_job_enter. Use a mutex to ensure that block_job_enter synchronizes correctly with block_job_sleep_ns. Signed-off-by: Paolo Bonzini Tested-By: Jeff Cody Reviewed-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Reviewed-by: Jeff Cody Signed-off-by: Kevin Wolf --- blockjob.c | 63 +++++++++++++++++++++++++++++------- include/block/blockjob.h | 8 ++++- include/block/blockjob_int.h | 4 +-- 3 files changed, 61 insertions(+), 14 deletions(-) diff --git a/blockjob.c b/blockjob.c index 4d22b7d2fb..0ed50b953b 100644 --- a/blockjob.c +++ b/blockjob.c @@ -37,6 +37,26 @@ #include "qemu/timer.h" #include "qapi-event.h" +/* Right now, this mutex is only needed to synchronize accesses to job->busy + * and job->sleep_timer, such as concurrent calls to block_job_do_yield and + * block_job_enter. */ +static QemuMutex block_job_mutex; + +static void block_job_lock(void) +{ + qemu_mutex_lock(&block_job_mutex); +} + +static void block_job_unlock(void) +{ + qemu_mutex_unlock(&block_job_mutex); +} + +static void __attribute__((__constructor__)) block_job_init(void) +{ + qemu_mutex_init(&block_job_mutex); +} + static void block_job_event_cancelled(BlockJob *job); static void block_job_event_completed(BlockJob *job, const char *msg); @@ -161,6 +181,7 @@ void block_job_unref(BlockJob *job) blk_unref(job->blk); error_free(job->blocker); g_free(job->id); + assert(!timer_pending(&job->sleep_timer)); g_free(job); } } @@ -287,6 +308,13 @@ static void coroutine_fn block_job_co_entry(void *opaque) job->driver->start(job); } +static void block_job_sleep_timer_cb(void *opaque) +{ + BlockJob *job = opaque; + + block_job_enter(job); +} + void block_job_start(BlockJob *job) { assert(job && !block_job_started(job) && job->paused && @@ -556,7 +584,7 @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp) info->type = g_strdup(BlockJobType_str(job->driver->job_type)); info->device = g_strdup(job->id); info->len = job->len; - info->busy = job->busy; + info->busy = atomic_read(&job->busy); info->paused = job->pause_count > 0; info->offset = job->offset; info->speed = job->speed; @@ -664,6 +692,9 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, job->paused = true; job->pause_count = 1; job->refcnt = 1; + aio_timer_init(qemu_get_aio_context(), &job->sleep_timer, + QEMU_CLOCK_REALTIME, SCALE_NS, + block_job_sleep_timer_cb, job); error_setg(&job->blocker, "block device is in use by block job: %s", BlockJobType_str(driver->job_type)); @@ -729,9 +760,20 @@ static bool block_job_should_pause(BlockJob *job) return job->pause_count > 0; } -static void block_job_do_yield(BlockJob *job) +/* Yield, and schedule a timer to reenter the coroutine after @ns nanoseconds. + * Reentering the job coroutine with block_job_enter() before the timer has + * expired is allowed and cancels the timer. + * + * If @ns is (uint64_t) -1, no timer is scheduled and block_job_enter() must be + * called explicitly. */ +static void block_job_do_yield(BlockJob *job, uint64_t ns) { + block_job_lock(); + if (ns != -1) { + timer_mod(&job->sleep_timer, ns); + } job->busy = false; + block_job_unlock(); qemu_coroutine_yield(); /* Set by block_job_enter before re-entering the coroutine. */ @@ -755,7 +797,7 @@ void coroutine_fn block_job_pause_point(BlockJob *job) if (block_job_should_pause(job) && !block_job_is_cancelled(job)) { job->paused = true; - block_job_do_yield(job); + block_job_do_yield(job, -1); job->paused = false; } @@ -785,11 +827,16 @@ void block_job_enter(BlockJob *job) return; } + block_job_lock(); if (job->busy) { + block_job_unlock(); return; } + assert(!job->deferred_to_main_loop); + timer_del(&job->sleep_timer); job->busy = true; + block_job_unlock(); aio_co_wake(job->co); } @@ -807,14 +854,8 @@ void block_job_sleep_ns(BlockJob *job, int64_t ns) return; } - /* We need to leave job->busy set here, because when we have - * put a coroutine to 'sleep', we have scheduled it to run in - * the future. We cannot enter that same coroutine again before - * it wakes and runs, otherwise we risk double-entry or entry after - * completion. */ if (!block_job_should_pause(job)) { - co_aio_sleep_ns(blk_get_aio_context(job->blk), - QEMU_CLOCK_REALTIME, ns); + block_job_do_yield(job, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + ns); } block_job_pause_point(job); @@ -830,7 +871,7 @@ void block_job_yield(BlockJob *job) } if (!block_job_should_pause(job)) { - block_job_do_yield(job); + block_job_do_yield(job, -1); } block_job_pause_point(job); diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 67c0968fa5..00403d9482 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -77,7 +77,7 @@ typedef struct BlockJob { /** * Set to false by the job while the coroutine has yielded and may be * re-entered by block_job_enter(). There may still be I/O or event loop - * activity pending. + * activity pending. Accessed under block_job_mutex (in blockjob.c). */ bool busy; @@ -135,6 +135,12 @@ typedef struct BlockJob { */ int ret; + /** + * Timer that is used by @block_job_sleep_ns. Accessed under + * block_job_mutex (in blockjob.c). + */ + QEMUTimer sleep_timer; + /** Non-NULL if this job is part of a transaction */ BlockJobTxn *txn; QLIST_ENTRY(BlockJob) txn_list; diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h index f7ab183a39..c9b23b0cc9 100644 --- a/include/block/blockjob_int.h +++ b/include/block/blockjob_int.h @@ -142,8 +142,8 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, * @ns: How many nanoseconds to stop for. * * Put the job to sleep (assuming that it wasn't canceled) for @ns - * %QEMU_CLOCK_REALTIME nanoseconds. Canceling the job will not interrupt - * the wait, so the cancel will not process until the coroutine wakes up. + * %QEMU_CLOCK_REALTIME nanoseconds. Canceling the job will immediately + * interrupt the wait. */ void block_job_sleep_ns(BlockJob *job, int64_t ns); From f1a7ff770f7d71ee7833ff019aac9d6cc3d13f71 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 27 Nov 2017 17:00:07 +0100 Subject: [PATCH 10/10] block/nfs: fix nfs_client_open for filesize greater than 1TB DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE) was overflowing ret (int) if st.st_size is greater than 1TB. Cc: qemu-stable@nongnu.org Signed-off-by: Peter Lieven Message-id: 1511798407-31129-1-git-send-email-pl@kamp.de Signed-off-by: Max Reitz --- block/nfs.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/block/nfs.c b/block/nfs.c index 337fcd9c84..effc8719b5 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -1,7 +1,7 @@ /* * QEMU Block driver for native access to files on NFS shares * - * Copyright (c) 2014-2016 Peter Lieven + * Copyright (c) 2014-2017 Peter Lieven * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -496,7 +496,7 @@ out: static int64_t nfs_client_open(NFSClient *client, QDict *options, int flags, int open_flags, Error **errp) { - int ret = -EINVAL; + int64_t ret = -EINVAL; QemuOpts *opts = NULL; Error *local_err = NULL; struct stat st; @@ -686,8 +686,7 @@ static QemuOptsList nfs_create_opts = { static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp) { - int ret = 0; - int64_t total_size = 0; + int64_t ret, total_size; NFSClient *client = g_new0(NFSClient, 1); QDict *options = NULL;