From 890cbccb089db9e646cc1baea3be9dc060e3917b Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Wed, 22 Jul 2020 16:22:31 -0500 Subject: [PATCH 1/9] nbd: Fix large trim/zero requests Although qemu as NBD client limits requests to <2G, the NBD protocol allows clients to send requests almost all the way up to 4G. But because our block layer is not yet 64-bit clean, we accidentally wrap such requests into a negative size, and fail with EIO instead of performing the intended operation. The bug is visible in modern systems with something as simple as: $ qemu-img create -f qcow2 /tmp/image.img 5G $ sudo qemu-nbd --connect=/dev/nbd0 /tmp/image.img $ sudo blkdiscard /dev/nbd0 or with user-space only: $ truncate --size=3G file $ qemu-nbd -f raw file $ nbdsh -u nbd://localhost:10809 -c 'h.trim(3*1024*1024*1024,0)' Although both blk_co_pdiscard and blk_pwrite_zeroes currently return 0 on success, this is also a good time to fix our code to a more robust paradigm that treats all non-negative values as success. Alas, our iotests do not currently make it easy to add external dependencies on blkdiscard or nbdsh, so we have to rely on manual testing for now. This patch can be reverted when we later improve the overall block layer to be 64-bit clean, but for now, a minimal fix was deemed less risky prior to release. CC: qemu-stable@nongnu.org Fixes: 1f4d6d18ed Fixes: 1c6c4bb7f0 Fixes: https://github.com/systemd/systemd/issues/16242 Signed-off-by: Eric Blake Message-Id: <20200722212231.535072-1-eblake@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy [eblake: rework success tests to use >=0] --- nbd/server.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index 4752a6c8bc..c5d71cff10 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -2378,8 +2378,17 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, if (request->flags & NBD_CMD_FLAG_FAST_ZERO) { flags |= BDRV_REQ_NO_FALLBACK; } - ret = blk_pwrite_zeroes(exp->blk, request->from + exp->dev_offset, - request->len, flags); + ret = 0; + /* FIXME simplify this when blk_pwrite_zeroes switches to 64-bit */ + while (ret >= 0 && request->len) { + int align = client->check_align ?: 1; + int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES, + align)); + ret = blk_pwrite_zeroes(exp->blk, request->from + exp->dev_offset, + len, flags); + request->len -= len; + request->from += len; + } return nbd_send_generic_reply(client, request->handle, ret, "writing to file failed", errp); @@ -2393,9 +2402,18 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, "flush failed", errp); case NBD_CMD_TRIM: - ret = blk_co_pdiscard(exp->blk, request->from + exp->dev_offset, - request->len); - if (ret == 0 && request->flags & NBD_CMD_FLAG_FUA) { + ret = 0; + /* FIXME simplify this when blk_co_pdiscard switches to 64-bit */ + while (ret >= 0 && request->len) { + int align = client->check_align ?: 1; + int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES, + align)); + ret = blk_co_pdiscard(exp->blk, request->from + exp->dev_offset, + len); + request->len -= len; + request->from += len; + } + if (ret >= 0 && request->flags & NBD_CMD_FLAG_FUA) { ret = blk_co_flush(exp->blk); } return nbd_send_generic_reply(client, request->handle, ret, From a2b333c01880f56056d50c238834d62e32001e54 Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Tue, 28 Jul 2020 00:58:43 +0300 Subject: [PATCH 2/9] block: nbd: Fix convert qcow2 compressed to nbd When converting to qcow2 compressed format, the last step is a special zero length compressed write, ending in a call to bdrv_co_truncate(). This call always fails for the nbd driver since it does not implement bdrv_co_truncate(). For block devices, which have the same limits, the call succeeds since the file driver implements bdrv_co_truncate(). If the caller asked to truncate to the same or smaller size with exact=false, the truncate succeeds. Implement the same logic for nbd. Example failing without this change: In one shell start qemu-nbd: $ truncate -s 1g test.tar $ qemu-nbd --socket=/tmp/nbd.sock --persistent --format=raw --offset 1536 test.tar In another shell convert an image to qcow2 compressed via NBD: $ echo "disk data" > disk.raw $ truncate -s 1g disk.raw $ qemu-img convert -f raw -O qcow2 -c disk1.raw nbd+unix:///?socket=/tmp/nbd.sock; echo $? 1 qemu-img failed, but the conversion was successful: $ qemu-img info nbd+unix:///?socket=/tmp/nbd.sock image: nbd+unix://?socket=/tmp/nbd.sock file format: qcow2 virtual size: 1 GiB (1073741824 bytes) ... $ qemu-img check nbd+unix:///?socket=/tmp/nbd.sock No errors were found on the image. 1/16384 = 0.01% allocated, 100.00% fragmented, 100.00% compressed clusters Image end offset: 393216 $ qemu-img compare disk.raw nbd+unix:///?socket=/tmp/nbd.sock Images are identical. Fixes: https://bugzilla.redhat.com/1860627 Signed-off-by: Nir Soffer Message-Id: <20200727215846.395443-2-nsoffer@redhat.com> Reviewed-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy [eblake: typo fixes] Signed-off-by: Eric Blake --- block/nbd.c | 30 ++++++++++++++++++++++++++++++ qemu-io-cmds.c | 2 +- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/block/nbd.c b/block/nbd.c index 65a4f56924..8934bcb479 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -1966,6 +1966,33 @@ static void nbd_close(BlockDriverState *bs) nbd_clear_bdrvstate(s); } +/* + * NBD cannot truncate, but if the caller asks to truncate to the same size, or + * to a smaller size with exact=false, there is no reason to fail the + * operation. + * + * Preallocation mode is ignored since it does not seems useful to fail when + * we never change anything. + */ +static int coroutine_fn nbd_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, + BdrvRequestFlags flags, Error **errp) +{ + BDRVNBDState *s = bs->opaque; + + if (offset != s->info.size && exact) { + error_setg(errp, "Cannot resize NBD nodes"); + return -ENOTSUP; + } + + if (offset > s->info.size) { + error_setg(errp, "Cannot grow NBD nodes"); + return -EINVAL; + } + + return 0; +} + static int64_t nbd_getlength(BlockDriverState *bs) { BDRVNBDState *s = bs->opaque; @@ -2045,6 +2072,7 @@ static BlockDriver bdrv_nbd = { .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_pdiscard = nbd_client_co_pdiscard, .bdrv_refresh_limits = nbd_refresh_limits, + .bdrv_co_truncate = nbd_co_truncate, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_client_detach_aio_context, .bdrv_attach_aio_context = nbd_client_attach_aio_context, @@ -2072,6 +2100,7 @@ static BlockDriver bdrv_nbd_tcp = { .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_pdiscard = nbd_client_co_pdiscard, .bdrv_refresh_limits = nbd_refresh_limits, + .bdrv_co_truncate = nbd_co_truncate, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_client_detach_aio_context, .bdrv_attach_aio_context = nbd_client_attach_aio_context, @@ -2099,6 +2128,7 @@ static BlockDriver bdrv_nbd_unix = { .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_pdiscard = nbd_client_co_pdiscard, .bdrv_refresh_limits = nbd_refresh_limits, + .bdrv_co_truncate = nbd_co_truncate, .bdrv_getlength = nbd_getlength, .bdrv_detach_aio_context = nbd_client_detach_aio_context, .bdrv_attach_aio_context = nbd_client_attach_aio_context, diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 851f07e8f8..baeae86d8c 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -1715,7 +1715,7 @@ static int truncate_f(BlockBackend *blk, int argc, char **argv) * exact=true. It is better to err on the "emit more errors" side * than to be overly permissive. */ - ret = blk_truncate(blk, offset, true, PREALLOC_MODE_OFF, 0, &local_err); + ret = blk_truncate(blk, offset, false, PREALLOC_MODE_OFF, 0, &local_err); if (ret < 0) { error_report_err(local_err); return ret; From b7719bcad2e92bab5aae3166fa9011f127e6ee49 Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Tue, 28 Jul 2020 00:58:44 +0300 Subject: [PATCH 3/9] iotests: Make qemu_nbd_popen() a contextmanager Instead of duplicating the code to wait until the server is ready and remember to terminate the server and wait for it, make it possible to use like this: with qemu_nbd_popen('-k', sock, image): # Access image via qemu-nbd socket... Only test 264 used this helper, but I had to modify the output since it did not consistently when starting and stopping qemu-nbd. Signed-off-by: Nir Soffer Message-Id: <20200727215846.395443-3-nsoffer@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Eric Blake --- tests/qemu-iotests/264 | 72 +++++++++++++---------------------- tests/qemu-iotests/264.out | 2 + tests/qemu-iotests/iotests.py | 28 +++++++++++++- 3 files changed, 54 insertions(+), 48 deletions(-) diff --git a/tests/qemu-iotests/264 b/tests/qemu-iotests/264 index 304a7443d7..666f164ed8 100755 --- a/tests/qemu-iotests/264 +++ b/tests/qemu-iotests/264 @@ -36,48 +36,32 @@ wait_step = 0.2 qemu_img_create('-f', iotests.imgfmt, disk_a, str(size)) qemu_img_create('-f', iotests.imgfmt, disk_b, str(size)) -srv = qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b) -# Wait for NBD server availability -t = 0 -ok = False -while t < wait_limit: - ok = qemu_io_silent_check('-f', 'raw', '-c', 'read 0 512', nbd_uri) - if ok: - break - time.sleep(wait_step) - t += wait_step +with qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b): + vm = iotests.VM().add_drive(disk_a) + vm.launch() + vm.hmp_qemu_io('drive0', 'write 0 {}'.format(size)) -assert ok + vm.qmp_log('blockdev-add', filters=[iotests.filter_qmp_testfiles], + **{'node_name': 'backup0', + 'driver': 'raw', + 'file': {'driver': 'nbd', + 'server': {'type': 'unix', 'path': nbd_sock}, + 'reconnect-delay': 10}}) + vm.qmp_log('blockdev-backup', device='drive0', sync='full', target='backup0', + speed=(1 * 1024 * 1024)) -vm = iotests.VM().add_drive(disk_a) -vm.launch() -vm.hmp_qemu_io('drive0', 'write 0 {}'.format(size)) + # Wait for some progress + t = 0 + while t < wait_limit: + jobs = vm.qmp('query-block-jobs')['return'] + if jobs and jobs[0]['offset'] > 0: + break + time.sleep(wait_step) + t += wait_step -vm.qmp_log('blockdev-add', filters=[iotests.filter_qmp_testfiles], - **{'node_name': 'backup0', - 'driver': 'raw', - 'file': {'driver': 'nbd', - 'server': {'type': 'unix', 'path': nbd_sock}, - 'reconnect-delay': 10}}) -vm.qmp_log('blockdev-backup', device='drive0', sync='full', target='backup0', - speed=(1 * 1024 * 1024)) - -# Wait for some progress -t = 0 -while t < wait_limit: - jobs = vm.qmp('query-block-jobs')['return'] if jobs and jobs[0]['offset'] > 0: - break - time.sleep(wait_step) - t += wait_step - -if jobs and jobs[0]['offset'] > 0: - log('Backup job is started') - -log('Kill NBD server') -srv.kill() -srv.wait() + log('Backup job is started') jobs = vm.qmp('query-block-jobs')['return'] if jobs and jobs[0]['offset'] < jobs[0]['len']: @@ -88,12 +72,8 @@ vm.qmp_log('block-job-set-speed', device='drive0', speed=0) # Emulate server down time for 1 second time.sleep(1) -log('Start NBD server') -srv = qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b) - -e = vm.event_wait('BLOCK_JOB_COMPLETED') -log('Backup completed: {}'.format(e['data']['offset'])) - -vm.qmp_log('blockdev-del', node_name='backup0') -srv.kill() -vm.shutdown() +with qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b): + e = vm.event_wait('BLOCK_JOB_COMPLETED') + log('Backup completed: {}'.format(e['data']['offset'])) + vm.qmp_log('blockdev-del', node_name='backup0') + vm.shutdown() diff --git a/tests/qemu-iotests/264.out b/tests/qemu-iotests/264.out index 3000944b09..c45b1e81ef 100644 --- a/tests/qemu-iotests/264.out +++ b/tests/qemu-iotests/264.out @@ -1,3 +1,4 @@ +Start NBD server {"execute": "blockdev-add", "arguments": {"driver": "raw", "file": {"driver": "nbd", "reconnect-delay": 10, "server": {"path": "TEST_DIR/PID-nbd-sock", "type": "unix"}}, "node-name": "backup0"}} {"return": {}} {"execute": "blockdev-backup", "arguments": {"device": "drive0", "speed": 1048576, "sync": "full", "target": "backup0"}} @@ -11,3 +12,4 @@ Start NBD server Backup completed: 5242880 {"execute": "blockdev-del", "arguments": {"node-name": "backup0"}} {"return": {}} +Kill NBD server diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index 3590ed78a0..8f79668435 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -28,10 +28,13 @@ import struct import subprocess import sys +import time from typing import (Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, TypeVar) import unittest +from contextlib import contextmanager + # pylint: disable=import-error, wrong-import-position sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'python')) from qemu import qtest @@ -270,9 +273,30 @@ def qemu_nbd_early_pipe(*args): return subp.returncode, output if subp.returncode else '' +@contextmanager def qemu_nbd_popen(*args): - '''Run qemu-nbd in daemon mode and return the parent's exit code''' - return subprocess.Popen(qemu_nbd_args + ['--persistent'] + list(args)) + '''Context manager running qemu-nbd within the context''' + pid_file = file_path("pid") + + cmd = list(qemu_nbd_args) + cmd.extend(('--persistent', '--pid-file', pid_file)) + cmd.extend(args) + + log('Start NBD server') + p = subprocess.Popen(cmd) + try: + while not os.path.exists(pid_file): + if p.poll() is not None: + raise RuntimeError( + "qemu-nbd terminated with exit code {}: {}" + .format(p.returncode, ' '.join(cmd))) + + time.sleep(0.01) + yield + finally: + log('Kill NBD server') + p.kill() + p.wait() def compare_images(img1, img2, fmt1=imgfmt, fmt2=imgfmt): '''Return True if two image files are identical''' From 4b914b01cdb95fe5722cb70826db7f0a262e8d2a Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Tue, 28 Jul 2020 00:58:45 +0300 Subject: [PATCH 4/9] iotests: Add more qemu_img helpers Add 2 helpers for measuring and checking images: - qemu_img_measure() - qemu_img_check() Both use --output-json and parse the returned json to make easy to use in other tests. I'm going to use them in a new test, and I hope they will be useful in may other tests. Signed-off-by: Nir Soffer Message-Id: <20200727215846.395443-4-nsoffer@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Eric Blake --- tests/qemu-iotests/iotests.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index 8f79668435..717b5b652c 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -141,6 +141,12 @@ def qemu_img_create(*args): return qemu_img(*args) +def qemu_img_measure(*args): + return json.loads(qemu_img_pipe("measure", "--output", "json", *args)) + +def qemu_img_check(*args): + return json.loads(qemu_img_pipe("check", "--output", "json", *args)) + def qemu_img_verbose(*args): '''Run qemu-img without suppressing its output and return the exit code''' exitcode = subprocess.call(qemu_img_args + list(args)) From 03a970bb6ffe17a3ae77d54d4127330cf9a73587 Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Tue, 28 Jul 2020 00:58:46 +0300 Subject: [PATCH 5/9] iotests: Test convert to qcow2 compressed to NBD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add test for "qemu-img convert -O qcow2 -c" to NBD target. The tests     create a OVA file and write compressed qcow2 disk content directly into the OVA file via qemu-nbd. Signed-off-by: Nir Soffer Message-Id: <20200727215846.395443-5-nsoffer@redhat.com> Tested-by: Eric Blake Reviewed-by: Eric Blake Signed-off-by: Eric Blake --- tests/qemu-iotests/302 | 127 +++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/302.out | 31 +++++++++ tests/qemu-iotests/group | 1 + 3 files changed, 159 insertions(+) create mode 100755 tests/qemu-iotests/302 create mode 100644 tests/qemu-iotests/302.out diff --git a/tests/qemu-iotests/302 b/tests/qemu-iotests/302 new file mode 100755 index 0000000000..a8506bda15 --- /dev/null +++ b/tests/qemu-iotests/302 @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# +# Tests converting qcow2 compressed to NBD +# +# Copyright (c) 2020 Nir Soffer +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# owner=nirsof@gmail.com + +import io +import tarfile + +import iotests + +from iotests import ( + file_path, + qemu_img, + qemu_img_check, + qemu_img_create, + qemu_img_log, + qemu_img_measure, + qemu_io, + qemu_nbd_popen, +) + +iotests.script_initialize(supported_fmts=["qcow2"]) + +# Create source disk. Using qcow2 to enable strict comparing later, and +# avoid issues with random filesystem on CI environment. +src_disk = file_path("disk.qcow2") +qemu_img_create("-f", iotests.imgfmt, src_disk, "1g") +qemu_io("-f", iotests.imgfmt, "-c", "write 1m 64k", src_disk) + +# The use case is writing qcow2 image directly into an ova file, which +# is a tar file with specific layout. This is tricky since we don't know the +# size of the image before compressing, so we have to do: +# 1. Add an ovf file. +# 2. Find the offset of the next member data. +# 3. Make room for image data, allocating for the worst case. +# 4. Write compressed image data into the tar. +# 5. Add a tar entry with the actual image size. +# 6. Shrink the tar to the actual size, aligned to 512 bytes. + +tar_file = file_path("test.ova") + +with tarfile.open(tar_file, "w") as tar: + + # 1. Add an ovf file. + + ovf_data = b"" + ovf = tarfile.TarInfo("vm.ovf") + ovf.size = len(ovf_data) + tar.addfile(ovf, io.BytesIO(ovf_data)) + + # 2. Find the offset of the next member data. + + offset = tar.fileobj.tell() + 512 + + # 3. Make room for image data, allocating for the worst case. + + measure = qemu_img_measure("-O", "qcow2", src_disk) + tar.fileobj.truncate(offset + measure["required"]) + + # 4. Write compressed image data into the tar. + + nbd_sock = file_path("nbd-sock", base_dir=iotests.sock_dir) + nbd_uri = "nbd+unix:///exp?socket=" + nbd_sock + + # Use raw format to allow creating qcow2 directly into tar file. + with qemu_nbd_popen( + "--socket", nbd_sock, + "--export-name", "exp", + "--format", "raw", + "--offset", str(offset), + tar_file): + + iotests.log("=== Target image info ===") + qemu_img_log("info", nbd_uri) + + qemu_img( + "convert", + "-f", iotests.imgfmt, + "-O", "qcow2", + "-c", + src_disk, + nbd_uri) + + iotests.log("=== Converted image info ===") + qemu_img_log("info", nbd_uri) + + iotests.log("=== Converted image check ===") + qemu_img_log("check", nbd_uri) + + iotests.log("=== Comparing to source disk ===") + qemu_img_log("compare", src_disk, nbd_uri) + + actual_size = qemu_img_check(nbd_uri)["image-end-offset"] + + # 5. Add a tar entry with the actual image size. + + disk = tarfile.TarInfo("disk") + disk.size = actual_size + tar.addfile(disk) + + # 6. Shrink the tar to the actual size, aligned to 512 bytes. + + tar_size = offset + (disk.size + 511) & ~511 + tar.fileobj.seek(tar_size) + tar.fileobj.truncate(tar_size) + +with tarfile.open(tar_file) as tar: + members = [{"name": m.name, "size": m.size, "offset": m.offset_data} + for m in tar] + iotests.log("=== OVA file contents ===") + iotests.log(members) diff --git a/tests/qemu-iotests/302.out b/tests/qemu-iotests/302.out new file mode 100644 index 0000000000..e37d3a1030 --- /dev/null +++ b/tests/qemu-iotests/302.out @@ -0,0 +1,31 @@ +Start NBD server +=== Target image info === +image: nbd+unix:///exp?socket=SOCK_DIR/PID-nbd-sock +file format: raw +virtual size: 448 KiB (458752 bytes) +disk size: unavailable + +=== Converted image info === +image: nbd+unix:///exp?socket=SOCK_DIR/PID-nbd-sock +file format: qcow2 +virtual size: 1 GiB (1073741824 bytes) +disk size: unavailable +cluster_size: 65536 +Format specific information: + compat: 1.1 + compression type: zlib + lazy refcounts: false + refcount bits: 16 + corrupt: false + +=== Converted image check === +No errors were found on the image. +1/16384 = 0.01% allocated, 100.00% fragmented, 100.00% compressed clusters +Image end offset: 393216 + +=== Comparing to source disk === +Images are identical. + +Kill NBD server +=== OVA file contents === +[{"name": "vm.ovf", "offset": 512, "size": 6}, {"name": "disk", "offset": 1536, "size": 393216}] diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 1d0252e1f0..1e1cb27bc8 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -308,3 +308,4 @@ 297 meta 299 auto quick 301 backing quick +302 quick From fa35591b9cb9a7fd0af2d8c2d8848abba30d3c69 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 27 Jul 2020 21:47:47 +0300 Subject: [PATCH 6/9] block/nbd: split nbd_establish_connection out of nbd_client_connect We are going to implement non-blocking version of nbd_establish_connection, which for a while will be used only for nbd_reconnect_attempt, not for nbd_open, so we need to call it separately. Refactor nbd_reconnect_attempt in a way which makes next commit simpler. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200727184751.15704-2-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Signed-off-by: Eric Blake --- block/nbd.c | 60 +++++++++++++++++++++++++++------------------- block/trace-events | 4 ++-- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index 8934bcb479..3558c173e3 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -93,7 +93,10 @@ typedef struct BDRVNBDState { char *x_dirty_bitmap; } BDRVNBDState; -static int nbd_client_connect(BlockDriverState *bs, Error **errp); +static QIOChannelSocket *nbd_establish_connection(SocketAddress *saddr, + Error **errp); +static int nbd_client_handshake(BlockDriverState *bs, QIOChannelSocket *sioc, + Error **errp); static void nbd_clear_bdrvstate(BDRVNBDState *s) { @@ -241,7 +244,9 @@ static bool nbd_client_connecting_wait(BDRVNBDState *s) static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s) { + int ret; Error *local_err = NULL; + QIOChannelSocket *sioc; if (!nbd_client_connecting(s)) { return; @@ -280,19 +285,25 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s) s->ioc = NULL; } - s->connect_status = nbd_client_connect(s->bs, &local_err); + sioc = nbd_establish_connection(s->saddr, &local_err); + if (!sioc) { + ret = -ECONNREFUSED; + goto out; + } + + ret = nbd_client_handshake(s->bs, sioc, &local_err); + +out: + s->connect_status = ret; error_free(s->connect_err); s->connect_err = NULL; error_propagate(&s->connect_err, local_err); - if (s->connect_status < 0) { - /* failed attempt */ - return; + if (ret >= 0) { + /* successfully connected */ + s->state = NBD_CLIENT_CONNECTED; + qemu_co_queue_restart_all(&s->free_sema); } - - /* successfully connected */ - s->state = NBD_CLIENT_CONNECTED; - qemu_co_queue_restart_all(&s->free_sema); } static coroutine_fn void nbd_co_reconnect_loop(BDRVNBDState *s) @@ -1425,24 +1436,15 @@ static QIOChannelSocket *nbd_establish_connection(SocketAddress *saddr, return sioc; } -static int nbd_client_connect(BlockDriverState *bs, Error **errp) +/* nbd_client_handshake takes ownership on sioc. On failure it is unref'ed. */ +static int nbd_client_handshake(BlockDriverState *bs, QIOChannelSocket *sioc, + Error **errp) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; AioContext *aio_context = bdrv_get_aio_context(bs); int ret; - /* - * establish TCP connection, return error if it fails - * TODO: Configurable retry-until-timeout behaviour. - */ - QIOChannelSocket *sioc = nbd_establish_connection(s->saddr, errp); - - if (!sioc) { - return -ECONNREFUSED; - } - - /* NBD handshake */ - trace_nbd_client_connect(s->export); + trace_nbd_client_handshake(s->export); qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL); qio_channel_attach_aio_context(QIO_CHANNEL(sioc), aio_context); @@ -1489,7 +1491,7 @@ static int nbd_client_connect(BlockDriverState *bs, Error **errp) object_ref(OBJECT(s->ioc)); } - trace_nbd_client_connect_success(s->export); + trace_nbd_client_handshake_success(s->export); return 0; @@ -1894,6 +1896,7 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, { int ret; BDRVNBDState *s = (BDRVNBDState *)bs->opaque; + QIOChannelSocket *sioc; ret = nbd_process_options(bs, options, errp); if (ret < 0) { @@ -1904,7 +1907,16 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, qemu_co_mutex_init(&s->send_mutex); qemu_co_queue_init(&s->free_sema); - ret = nbd_client_connect(bs, errp); + /* + * establish TCP connection, return error if it fails + * TODO: Configurable retry-until-timeout behaviour. + */ + sioc = nbd_establish_connection(s->saddr, errp); + if (!sioc) { + return -ECONNREFUSED; + } + + ret = nbd_client_handshake(bs, sioc, errp); if (ret < 0) { nbd_clear_bdrvstate(s); return ret; diff --git a/block/trace-events b/block/trace-events index d3533ca896..9158335061 100644 --- a/block/trace-events +++ b/block/trace-events @@ -168,8 +168,8 @@ nbd_parse_blockstatus_compliance(const char *err) "ignoring extra data from non- nbd_structured_read_compliance(const char *type) "server sent non-compliant unaligned read %s chunk" nbd_read_reply_entry_fail(int ret, const char *err) "ret = %d, err: %s" nbd_co_request_fail(uint64_t from, uint32_t len, uint64_t handle, uint16_t flags, uint16_t type, const char *name, int ret, const char *err) "Request failed { .from = %" PRIu64", .len = %" PRIu32 ", .handle = %" PRIu64 ", .flags = 0x%" PRIx16 ", .type = %" PRIu16 " (%s) } ret = %d, err: %s" -nbd_client_connect(const char *export_name) "export '%s'" -nbd_client_connect_success(const char *export_name) "export '%s'" +nbd_client_handshake(const char *export_name) "export '%s'" +nbd_client_handshake_success(const char *export_name) "export '%s'" # ssh.c ssh_restart_coroutine(void *co) "co=%p" From dd1ec1a4afe190e030edfa052d95c9e6e065438c Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 27 Jul 2020 21:47:48 +0300 Subject: [PATCH 7/9] block/nbd: allow drain during reconnect attempt It should be safe to reenter qio_channel_yield() on io/channel read/write path, so it's safe to reduce in_flight and allow attaching new aio context. And no problem to allow drain itself: connection attempt is not a guest request. Moreover, if remote server is down, we can hang in negotiation, blocking drain section and provoking a dead lock. How to reproduce the dead lock: 1. Create nbd-fault-injector.conf with the following contents: [inject-error "mega1"] event=data io=readwrite when=before 2. In one terminal run nbd-fault-injector in a loop, like this: n=1; while true; do echo $n; ((n++)); ./nbd-fault-injector.py 127.0.0.1:10000 nbd-fault-injector.conf; done 3. In another terminal run qemu-io in a loop, like this: n=1; while true; do echo $n; ((n++)); ./qemu-io -c 'read 0 512' nbd://127.0.0.1:10000; done After some time, qemu-io will hang trying to drain, for example, like this: #3 aio_poll (ctx=0x55f006bdd890, blocking=true) at util/aio-posix.c:600 #4 bdrv_do_drained_begin (bs=0x55f006bea710, recursive=false, parent=0x0, ignore_bds_parents=false, poll=true) at block/io.c:427 #5 bdrv_drained_begin (bs=0x55f006bea710) at block/io.c:433 #6 blk_drain (blk=0x55f006befc80) at block/block-backend.c:1710 #7 blk_unref (blk=0x55f006befc80) at block/block-backend.c:498 #8 bdrv_open_inherit (filename=0x7fffba1563bc "nbd+tcp://127.0.0.1:10000", reference=0x0, options=0x55f006be86d0, flags=24578, parent=0x0, child_class=0x0, child_role=0, errp=0x7fffba154620) at block.c:3491 #9 bdrv_open (filename=0x7fffba1563bc "nbd+tcp://127.0.0.1:10000", reference=0x0, options=0x0, flags=16386, errp=0x7fffba154620) at block.c:3513 #10 blk_new_open (filename=0x7fffba1563bc "nbd+tcp://127.0.0.1:10000", reference=0x0, options=0x0, flags=16386, errp=0x7fffba154620) at block/block-backend.c:421 And connection_co stack like this: #0 qemu_coroutine_switch (from_=0x55f006bf2650, to_=0x7fe96e07d918, action=COROUTINE_YIELD) at util/coroutine-ucontext.c:302 #1 qemu_coroutine_yield () at util/qemu-coroutine.c:193 #2 qio_channel_yield (ioc=0x55f006bb3c20, condition=G_IO_IN) at io/channel.c:472 #3 qio_channel_readv_all_eof (ioc=0x55f006bb3c20, iov=0x7fe96d729bf0, niov=1, errp=0x7fe96d729eb0) at io/channel.c:110 #4 qio_channel_readv_all (ioc=0x55f006bb3c20, iov=0x7fe96d729bf0, niov=1, errp=0x7fe96d729eb0) at io/channel.c:143 #5 qio_channel_read_all (ioc=0x55f006bb3c20, buf=0x7fe96d729d28 "\300.\366\004\360U", buflen=8, errp=0x7fe96d729eb0) at io/channel.c:247 #6 nbd_read (ioc=0x55f006bb3c20, buffer=0x7fe96d729d28, size=8, desc=0x55f004f69644 "initial magic", errp=0x7fe96d729eb0) at /work/src/qemu/master/include/block/nbd.h:365 #7 nbd_read64 (ioc=0x55f006bb3c20, val=0x7fe96d729d28, desc=0x55f004f69644 "initial magic", errp=0x7fe96d729eb0) at /work/src/qemu/master/include/block/nbd.h:391 #8 nbd_start_negotiate (aio_context=0x55f006bdd890, ioc=0x55f006bb3c20, tlscreds=0x0, hostname=0x0, outioc=0x55f006bf19f8, structured_reply=true, zeroes=0x7fe96d729dca, errp=0x7fe96d729eb0) at nbd/client.c:904 #9 nbd_receive_negotiate (aio_context=0x55f006bdd890, ioc=0x55f006bb3c20, tlscreds=0x0, hostname=0x0, outioc=0x55f006bf19f8, info=0x55f006bf1a00, errp=0x7fe96d729eb0) at nbd/client.c:1032 #10 nbd_client_connect (bs=0x55f006bea710, errp=0x7fe96d729eb0) at block/nbd.c:1460 #11 nbd_reconnect_attempt (s=0x55f006bf19f0) at block/nbd.c:287 #12 nbd_co_reconnect_loop (s=0x55f006bf19f0) at block/nbd.c:309 #13 nbd_connection_entry (opaque=0x55f006bf19f0) at block/nbd.c:360 #14 coroutine_trampoline (i0=113190480, i1=22000) at util/coroutine-ucontext.c:173 Note, that the hang may be triggered by another bug, so the whole case is fixed only together with commit "block/nbd: on shutdown terminate connection attempt". Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200727184751.15704-3-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Signed-off-by: Eric Blake --- block/nbd.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/block/nbd.c b/block/nbd.c index 3558c173e3..ee9ab7512b 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -291,8 +291,22 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s) goto out; } + bdrv_dec_in_flight(s->bs); + ret = nbd_client_handshake(s->bs, sioc, &local_err); + if (s->drained) { + s->wait_drained_end = true; + while (s->drained) { + /* + * We may be entered once from nbd_client_attach_aio_context_bh + * and then from nbd_client_co_drain_end. So here is a loop. + */ + qemu_coroutine_yield(); + } + } + bdrv_inc_in_flight(s->bs); + out: s->connect_status = ret; error_free(s->connect_err); From fbeb3e63b34a1af4a968031de1c82e5edf20bf6c Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 27 Jul 2020 21:47:49 +0300 Subject: [PATCH 8/9] block/nbd: on shutdown terminate connection attempt On shutdown nbd driver may be in a connecting state. We should shutdown it as well, otherwise we may hang in nbd_teardown_connection, waiting for conneciton_co to finish in BDRV_POLL_WHILE(bs, s->connection_co) loop if remote server is down. How to reproduce the dead lock: 1. Create nbd-fault-injector.conf with the following contents: [inject-error "mega1"] event=data io=readwrite when=before 2. In one terminal run nbd-fault-injector in a loop, like this: n=1; while true; do echo $n; ((n++)); ./nbd-fault-injector.py 127.0.0.1:10000 nbd-fault-injector.conf; done 3. In another terminal run qemu-io in a loop, like this: n=1; while true; do echo $n; ((n++)); ./qemu-io -c 'read 0 512' nbd://127.0.0.1:10000; done After some time, qemu-io will hang. Note, that this hang may be triggered by another bug, so the whole case is fixed only together with commit "block/nbd: allow drain during reconnect attempt". Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Message-Id: <20200727184751.15704-4-vsementsov@virtuozzo.com> Signed-off-by: Eric Blake --- block/nbd.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index ee9ab7512b..620c97be6b 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -209,11 +209,15 @@ static void nbd_teardown_connection(BlockDriverState *bs) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; - if (s->state == NBD_CLIENT_CONNECTED) { + if (s->ioc) { /* finish any pending coroutines */ - assert(s->ioc); qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + } else if (s->sioc) { + /* abort negotiation */ + qio_channel_shutdown(QIO_CHANNEL(s->sioc), QIO_CHANNEL_SHUTDOWN_BOTH, + NULL); } + s->state = NBD_CLIENT_QUIT; if (s->connection_co) { if (s->connection_co_sleep_ns_state) { @@ -1459,6 +1463,9 @@ static int nbd_client_handshake(BlockDriverState *bs, QIOChannelSocket *sioc, int ret; trace_nbd_client_handshake(s->export); + + s->sioc = sioc; + qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL); qio_channel_attach_aio_context(QIO_CHANNEL(sioc), aio_context); @@ -1473,6 +1480,7 @@ static int nbd_client_handshake(BlockDriverState *bs, QIOChannelSocket *sioc, g_free(s->info.name); if (ret < 0) { object_unref(OBJECT(sioc)); + s->sioc = NULL; return ret; } if (s->x_dirty_bitmap && !s->info.base_allocation) { @@ -1498,8 +1506,6 @@ static int nbd_client_handshake(BlockDriverState *bs, QIOChannelSocket *sioc, } } - s->sioc = sioc; - if (!s->ioc) { s->ioc = QIO_CHANNEL(sioc); object_ref(OBJECT(s->ioc)); @@ -1520,6 +1526,7 @@ static int nbd_client_handshake(BlockDriverState *bs, QIOChannelSocket *sioc, nbd_send_request(s->ioc ?: QIO_CHANNEL(sioc), &request); object_unref(OBJECT(sioc)); + s->sioc = NULL; return ret; } From 12c75e20a269ac917f4a76936d7142264e522233 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 27 Jul 2020 21:47:50 +0300 Subject: [PATCH 9/9] block/nbd: nbd_co_reconnect_loop(): don't sleep if drained We try to go to wakeable sleep, so that, if drain begins it will break the sleep. But what if nbd_client_co_drain_begin() already called and s->drained is already true? We'll go to sleep, and drain will have to wait for the whole timeout. Let's improve it. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Message-Id: <20200727184751.15704-5-vsementsov@virtuozzo.com> Signed-off-by: Eric Blake --- block/nbd.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index 620c97be6b..7bb881fef4 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -341,8 +341,6 @@ static coroutine_fn void nbd_co_reconnect_loop(BDRVNBDState *s) qemu_co_queue_restart_all(&s->free_sema); } - qemu_co_sleep_ns_wakeable(QEMU_CLOCK_REALTIME, timeout, - &s->connection_co_sleep_ns_state); if (s->drained) { bdrv_dec_in_flight(s->bs); s->wait_drained_end = true; @@ -354,9 +352,12 @@ static coroutine_fn void nbd_co_reconnect_loop(BDRVNBDState *s) qemu_coroutine_yield(); } bdrv_inc_in_flight(s->bs); - } - if (timeout < max_timeout) { - timeout *= 2; + } else { + qemu_co_sleep_ns_wakeable(QEMU_CLOCK_REALTIME, timeout, + &s->connection_co_sleep_ns_state); + if (timeout < max_timeout) { + timeout *= 2; + } } nbd_reconnect_attempt(s);