diff --git a/block/nbd.c b/block/nbd.c
index 63dbfa807d..5853d85d60 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -110,6 +110,10 @@ static void nbd_clear_bdrvstate(BlockDriverState *bs)
 
     yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name));
 
+    /* Must not leave timers behind that would access freed data */
+    assert(!s->reconnect_delay_timer);
+    assert(!s->open_timer);
+
     object_unref(OBJECT(s->tlscreds));
     qapi_free_SocketAddress(s->saddr);
     s->saddr = NULL;
@@ -381,6 +385,13 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
     }
 
     nbd_co_do_establish_connection(s->bs, NULL);
+
+    /*
+     * The reconnect attempt is done (maybe successfully, maybe not), so
+     * we no longer need this timer. Delete it so it will not outlive
+     * this I/O request (so draining removes all timers).
+     */
+    reconnect_delay_timer_del(s);
 }
 
 static coroutine_fn int nbd_receive_replies(BDRVNBDState *s, uint64_t handle)
@@ -1878,11 +1889,19 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
+    /*
+     * The connect attempt is done, so we no longer need this timer.
+     * Delete it, because we do not want it to be around when this node
+     * is drained or closed.
+     */
+    open_timer_del(s);
+
     nbd_client_connection_enable_retry(s->conn);
 
     return 0;
 
 fail:
+    open_timer_del(s);
     nbd_clear_bdrvstate(bs);
     return ret;
 }
@@ -2036,6 +2055,42 @@ static void nbd_cancel_in_flight(BlockDriverState *bs)
     nbd_co_establish_connection_cancel(s->conn);
 }
 
+static void nbd_attach_aio_context(BlockDriverState *bs,
+                                   AioContext *new_context)
+{
+    BDRVNBDState *s = bs->opaque;
+
+    /* The open_timer is used only during nbd_open() */
+    assert(!s->open_timer);
+
+    /*
+     * The reconnect_delay_timer is scheduled in I/O paths when the
+     * connection is lost, to cancel the reconnection attempt after a
+     * given time. Once this attempt is done (successfully or not),
+     * nbd_reconnect_attempt() ensures the timer is deleted before the
+     * respective I/O request is resumed.
+     * Since the AioContext can only be changed when a node is drained,
+     * the reconnect_delay_timer cannot be active here.
+     */
+    assert(!s->reconnect_delay_timer);
+
+    if (s->ioc) {
+        qio_channel_attach_aio_context(s->ioc, new_context);
+    }
+}
+
+static void nbd_detach_aio_context(BlockDriverState *bs)
+{
+    BDRVNBDState *s = bs->opaque;
+
+    assert(!s->open_timer);
+    assert(!s->reconnect_delay_timer);
+
+    if (s->ioc) {
+        qio_channel_detach_aio_context(s->ioc);
+    }
+}
+
 static BlockDriver bdrv_nbd = {
     .format_name                = "nbd",
     .protocol_name              = "nbd",
@@ -2059,6 +2114,9 @@ static BlockDriver bdrv_nbd = {
     .bdrv_dirname               = nbd_dirname,
     .strong_runtime_opts        = nbd_strong_runtime_opts,
     .bdrv_cancel_in_flight      = nbd_cancel_in_flight,
+
+    .bdrv_attach_aio_context    = nbd_attach_aio_context,
+    .bdrv_detach_aio_context    = nbd_detach_aio_context,
 };
 
 static BlockDriver bdrv_nbd_tcp = {
@@ -2084,6 +2142,9 @@ static BlockDriver bdrv_nbd_tcp = {
     .bdrv_dirname               = nbd_dirname,
     .strong_runtime_opts        = nbd_strong_runtime_opts,
     .bdrv_cancel_in_flight      = nbd_cancel_in_flight,
+
+    .bdrv_attach_aio_context    = nbd_attach_aio_context,
+    .bdrv_detach_aio_context    = nbd_detach_aio_context,
 };
 
 static BlockDriver bdrv_nbd_unix = {
@@ -2109,6 +2170,9 @@ static BlockDriver bdrv_nbd_unix = {
     .bdrv_dirname               = nbd_dirname,
     .strong_runtime_opts        = nbd_strong_runtime_opts,
     .bdrv_cancel_in_flight      = nbd_cancel_in_flight,
+
+    .bdrv_attach_aio_context    = nbd_attach_aio_context,
+    .bdrv_detach_aio_context    = nbd_detach_aio_context,
 };
 
 static void bdrv_nbd_init(void)
diff --git a/tests/qemu-iotests/281 b/tests/qemu-iotests/281
index 318e333939..5e1339bd75 100755
--- a/tests/qemu-iotests/281
+++ b/tests/qemu-iotests/281
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# group: rw quick
+# group: rw
 #
 # Test cases for blockdev + IOThread interactions
 #
@@ -20,8 +20,9 @@
 #
 
 import os
+import time
 import iotests
-from iotests import qemu_img
+from iotests import qemu_img, QemuStorageDaemon
 
 image_len = 64 * 1024 * 1024
 
@@ -243,6 +244,102 @@ class TestBlockdevBackupAbort(iotests.QMPTestCase):
         # Hangs on failure, we expect this error.
         self.assert_qmp(result, 'error/class', 'GenericError')
 
+# Test for RHBZ#2033626
+class TestYieldingAndTimers(iotests.QMPTestCase):
+    sock = os.path.join(iotests.sock_dir, 'nbd.sock')
+    qsd = None
+
+    def setUp(self):
+        self.create_nbd_export()
+
+        # Simple VM with an NBD block device connected to the NBD export
+        # provided by the QSD, and an (initially unused) iothread
+        self.vm = iotests.VM()
+        self.vm.add_object('iothread,id=iothr')
+        self.vm.add_blockdev('nbd,node-name=nbd,server.type=unix,' +
+                             f'server.path={self.sock},export=exp,' +
+                             'reconnect-delay=1,open-timeout=1')
+
+        self.vm.launch()
+
+    def tearDown(self):
+        self.stop_nbd_export()
+        self.vm.shutdown()
+
+    def test_timers_with_blockdev_del(self):
+        # The NBD BDS will have had an active open timer, because setUp() gave
+        # a positive value for @open-timeout. It should be gone once the BDS
+        # has been opened.
+        # (But there used to be a bug where it remained active, which will
+        # become important below.)
+
+        # Stop and restart the NBD server, and do some I/O on the client to
+        # trigger a reconnect and start the reconnect delay timer
+        self.stop_nbd_export()
+        self.create_nbd_export()
+
+        result = self.vm.qmp('human-monitor-command',
+                             command_line='qemu-io nbd "write 0 512"')
+        self.assert_qmp(result, 'return', '')
+
+        # Reconnect is done, so the reconnect delay timer should be gone.
+        # (This is similar to how the open timer should be gone after open,
+        # and similarly there used to be a bug where it was not gone.)
+
+        # Delete the BDS to see whether both timers are gone. If they are not,
+        # they will remain active, fire later, and then access freed data.
+        # (Or, with "block/nbd: Assert there are no timers when closed"
+        # applied, the assertions added in that patch will fail.)
+        result = self.vm.qmp('blockdev-del', node_name='nbd')
+        self.assert_qmp(result, 'return', {})
+
+        # Give the timers some time to fire (both have a timeout of 1 s).
+        # (Sleeping in an iotest may ring some alarm bells, but note that if
+        # the timing is off here, the test will just always pass. If we kill
+        # the VM too early, then we just kill the timers before they can fire,
+        # thus not see the error, and so the test will pass.)
+        time.sleep(2)
+
+    def test_yield_in_iothread(self):
+        # Move the NBD node to the I/O thread; the NBD block driver should
+        # attach the connection's QIOChannel to that thread's AioContext, too
+        result = self.vm.qmp('x-blockdev-set-iothread',
+                             node_name='nbd', iothread='iothr')
+        self.assert_qmp(result, 'return', {})
+
+        # Do some I/O that will be throttled by the QSD, so that the network
+        # connection hopefully will yield here. When it is resumed, it must
+        # then be resumed in the I/O thread's AioContext.
+        result = self.vm.qmp('human-monitor-command',
+                             command_line='qemu-io nbd "read 0 128K"')
+        self.assert_qmp(result, 'return', '')
+
+    def create_nbd_export(self):
+        assert self.qsd is None
+
+        # Export a throttled null-co BDS: Reads are throttled (max 64 kB/s),
+        # writes are not.
+        self.qsd = QemuStorageDaemon(
+            '--object',
+            'throttle-group,id=thrgr,x-bps-read=65536,x-bps-read-max=65536',
+
+            '--blockdev',
+            'null-co,node-name=null,read-zeroes=true',
+
+            '--blockdev',
+            'throttle,node-name=thr,file=null,throttle-group=thrgr',
+
+            '--nbd-server',
+            f'addr.type=unix,addr.path={self.sock}',
+
+            '--export',
+            'nbd,id=exp,node-name=thr,name=exp,writable=true'
+        )
+
+    def stop_nbd_export(self):
+        self.qsd.stop()
+        self.qsd = None
+
 if __name__ == '__main__':
     iotests.main(supported_fmts=['qcow2'],
                  supported_protocols=['file'],
diff --git a/tests/qemu-iotests/281.out b/tests/qemu-iotests/281.out
index 89968f35d7..3f8a935a08 100644
--- a/tests/qemu-iotests/281.out
+++ b/tests/qemu-iotests/281.out
@@ -1,5 +1,5 @@
-....
+......
 ----------------------------------------------------------------------
-Ran 4 tests
+Ran 6 tests
 
 OK
diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 8cdb381f2a..6ba65eb1ff 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -73,6 +73,8 @@
 qemu_prog = os.environ.get('QEMU_PROG', 'qemu')
 qemu_opts = os.environ.get('QEMU_OPTIONS', '').strip().split(' ')
 
+qsd_prog = os.environ.get('QSD_PROG', 'qemu-storage-daemon')
+
 gdb_qemu_env = os.environ.get('GDB_OPTIONS')
 qemu_gdb = []
 if gdb_qemu_env:
@@ -345,6 +347,44 @@ def cmd(self, cmd):
         return self._read_output()
 
 
+class QemuStorageDaemon:
+    def __init__(self, *args: str, instance_id: str = 'a'):
+        assert '--pidfile' not in args
+        self.pidfile = os.path.join(test_dir, f'qsd-{instance_id}-pid')
+        all_args = [qsd_prog] + list(args) + ['--pidfile', self.pidfile]
+
+        # Cannot use with here, we want the subprocess to stay around
+        # pylint: disable=consider-using-with
+        self._p = subprocess.Popen(all_args)
+        while not os.path.exists(self.pidfile):
+            if self._p.poll() is not None:
+                cmd = ' '.join(all_args)
+                raise RuntimeError(
+                    'qemu-storage-daemon terminated with exit code ' +
+                    f'{self._p.returncode}: {cmd}')
+
+            time.sleep(0.01)
+
+        with open(self.pidfile, encoding='utf-8') as f:
+            self._pid = int(f.read().strip())
+
+        assert self._pid == self._p.pid
+
+    def stop(self, kill_signal=15):
+        self._p.send_signal(kill_signal)
+        self._p.wait()
+        self._p = None
+
+        try:
+            os.remove(self.pidfile)
+        except OSError:
+            pass
+
+    def __del__(self):
+        if self._p is not None:
+            self.stop(kill_signal=9)
+
+
 def qemu_nbd(*args):
     '''Run qemu-nbd in daemon mode and return the parent's exit code'''
     return subprocess.call(qemu_nbd_args + ['--fork'] + list(args))
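
--
Note (not part of the patch): below is a minimal sketch of how a test could
drive the QemuStorageDaemon helper added above, assuming it is used from an
iotest in the same way as test 281. The socket path and the 'null'/'exp'
names are illustrative only; the command-line flags are the same ones the
test passes, minus the throttling setup.

    import os

    import iotests
    from iotests import QemuStorageDaemon

    sock = os.path.join(iotests.sock_dir, 'nbd.sock')

    # Start a storage daemon that exports a null block device over NBD.
    # The constructor blocks until the daemon has written its PID file
    # (polled every 10 ms) and raises RuntimeError if the process exits
    # before doing so, so a client may connect as soon as this returns.
    qsd = QemuStorageDaemon(
        '--blockdev', 'null-co,node-name=null,read-zeroes=true',
        '--nbd-server', f'addr.type=unix,addr.path={sock}',
        '--export', 'nbd,id=exp,node-name=null,name=exp,writable=true')

    # ... connect an NBD client to the socket and do some I/O ...

    # Graceful shutdown (SIGTERM, kill_signal=15 by default); if stop()
    # is never called, __del__() falls back to SIGKILL
    qsd.stop()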