From ddd633e525fec68437d04b074130aedc9d461331 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi
Date: Thu, 21 May 2020 16:36:16 +0100
Subject: [PATCH 01/12] minikconf: explicitly set encoding to UTF-8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

QEMU currently only has ASCII Kconfig files but Linux actually uses
UTF-8. Explicitly specify the encoding and that we're doing text file
I/O.

It's unclear whether or not QEMU will ever need Unicode in its Kconfig
files. If we start using the help text then it will become an issue
sooner or later. Make this change now for consistency with Linux
Kconfig.

Reported-by: Philippe Mathieu-Daudé
Signed-off-by: Stefan Hajnoczi
Reviewed-by: Richard Henderson
Message-id: 20200521153616.307100-1-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi
---
 scripts/minikconf.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/minikconf.py b/scripts/minikconf.py
index 90b99517c1..bcd91015d3 100755
--- a/scripts/minikconf.py
+++ b/scripts/minikconf.py
@@ -402,7 +402,7 @@ def do_include(self, include):
         if incl_abs_fname in self.data.previously_included:
             return
         try:
-            fp = open(incl_abs_fname, 'r')
+            fp = open(incl_abs_fname, 'rt', encoding='utf-8')
         except IOError as e:
             raise KconfigParserError(self,
                                      '%s: %s' % (e.strerror, include))
@@ -696,7 +696,7 @@ def scan_token(self):
             parser.do_assignment(name, value == 'y')
             external_vars.add(name[7:])
         else:
-            fp = open(arg, 'r')
+            fp = open(arg, 'rt', encoding='utf-8')
             parser.parse_file(fp)
             fp.close()
 
@@ -705,7 +705,7 @@
     if key not in external_vars and config[key]:
         print ('CONFIG_%s=y' % key)
 
-    deps = open(argv[2], 'w')
+    deps = open(argv[2], 'wt', encoding='utf-8')
     for fname in data.previously_included:
         print ('%s: %s' % (argv[1], fname), file=deps)
     deps.close()

From 58ebc2c31337734a8a79b0566b31b19040deb2ea Mon Sep 17 00:00:00 2001
From: Daniele Buono
Date: Fri, 29 May 2020 16:51:19 -0400
Subject: [PATCH 02/12] coroutine: support SafeStack in ucontext backend

LLVM's SafeStack instrumentation does not yet support programs that make
use of the APIs in ucontext.h.

With the current implementation of coroutine-ucontext, the resulting
binary is incorrect, with different coroutines sharing the same unsafe
stack and producing undefined behavior at runtime.

This fix allocates an additional unsafe stack area for each coroutine,
and sets the new unsafe stack pointer before calling swapcontext() in
qemu_coroutine_new.

This is the only place where the pointer needs to be manually updated,
since sigsetjmp/siglongjmp are already instrumented by LLVM to properly
support SafeStack.

The additional stack is then freed in qemu_coroutine_delete.
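
As an illustration of the mechanism, here is a condensed sketch (the
enter_coroutine() helper and the co_sketch struct are hypothetical; the
real code is in the diff below). SafeStack splits each thread's stack in
two: address-taken locals live on a separate "unsafe" stack reached
through the thread-local __safestack_unsafe_stack_ptr. swapcontext()
switches the register state, and with it the safe stack, but knows
nothing about that pointer, so it has to be redirected by hand:

    /* Sketch only, assuming clang with -fsanitize=safe-stack */
    #include <stddef.h>
    #include <ucontext.h>

    extern __thread void *__safestack_unsafe_stack_ptr; /* compiler-provided */

    struct co_sketch {           /* hypothetical stand-in for the coroutine */
        void *unsafe_stack;
        size_t unsafe_stack_size;
        ucontext_t uc;
    };

    static void enter_coroutine(struct co_sketch *co, ucontext_t *old_uc)
    {
        /* Unsafe stacks grow downwards, so start at the top of the area. */
        __safestack_unsafe_stack_ptr =
            (char *)co->unsafe_stack + co->unsafe_stack_size;
        swapcontext(old_uc, &co->uc);
    }
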
Signed-off-by: Daniele Buono
Message-id: 20200529205122.714-2-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi
---
 include/qemu/coroutine_int.h |  5 +++++
 util/coroutine-ucontext.c    | 28 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
index bd6b0468e1..1da148552f 100644
--- a/include/qemu/coroutine_int.h
+++ b/include/qemu/coroutine_int.h
@@ -28,6 +28,11 @@
 #include "qemu/queue.h"
 #include "qemu/coroutine.h"
 
+#ifdef CONFIG_SAFESTACK
+/* Pointer to the unsafe stack, defined by the compiler */
+extern __thread void *__safestack_unsafe_stack_ptr;
+#endif
+
 #define COROUTINE_STACK_SIZE (1 << 20)
 
 typedef enum {
diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c
index 613f4c118e..f0b66320e1 100644
--- a/util/coroutine-ucontext.c
+++ b/util/coroutine-ucontext.c
@@ -45,6 +45,11 @@ typedef struct {
     Coroutine base;
     void *stack;
     size_t stack_size;
+#ifdef CONFIG_SAFESTACK
+    /* Need an unsafe stack for each coroutine */
+    void *unsafe_stack;
+    size_t unsafe_stack_size;
+#endif
     sigjmp_buf env;
 
     void *tsan_co_fiber;
@@ -179,6 +184,10 @@ Coroutine *qemu_coroutine_new(void)
     co = g_malloc0(sizeof(*co));
     co->stack_size = COROUTINE_STACK_SIZE;
     co->stack = qemu_alloc_stack(&co->stack_size);
+#ifdef CONFIG_SAFESTACK
+    co->unsafe_stack_size = COROUTINE_STACK_SIZE;
+    co->unsafe_stack = qemu_alloc_stack(&co->unsafe_stack_size);
+#endif
     co->base.entry_arg = &old_env; /* stash away our jmp_buf */
 
     uc.uc_link = &old_uc;
@@ -203,6 +212,22 @@ Coroutine *qemu_coroutine_new(void)
                           COROUTINE_YIELD, &fake_stack_save,
                           co->stack, co->stack_size, co->tsan_co_fiber);
+
+#ifdef CONFIG_SAFESTACK
+        /*
+         * Before we swap the context, set the new unsafe stack
+         * The unsafe stack grows just like the normal stack, so start from
+         * the last usable location of the memory area.
+         * NOTE: we don't have to re-set the usp afterwards because we are
+         * coming back to this context through a siglongjmp.
+         * The compiler already wrapped the corresponding sigsetjmp call with
+         * code that saves the usp on the (safe) stack before the call, and
+         * restores it right after (which is where we return with siglongjmp).
+         */
+        void *usp = co->unsafe_stack + co->unsafe_stack_size;
+        __safestack_unsafe_stack_ptr = usp;
+#endif
+
         swapcontext(&old_uc, &uc);
     }
 
@@ -235,6 +260,9 @@ void qemu_coroutine_delete(Coroutine *co_)
 #endif
 
     qemu_free_stack(co->stack, co->stack_size);
+#ifdef CONFIG_SAFESTACK
+    qemu_free_stack(co->unsafe_stack, co->unsafe_stack_size);
+#endif
     g_free(co);
 }

From ff76097ad8f7fdc9d1d707bed85c146fdbb5a16d Mon Sep 17 00:00:00 2001
From: Daniele Buono
Date: Fri, 29 May 2020 16:51:20 -0400
Subject: [PATCH 03/12] coroutine: add check for SafeStack in sigaltstack

The current implementation of LLVM's SafeStack is not compatible with
code that uses an alternate stack created with sigaltstack(). Since
coroutine-sigaltstack relies on sigaltstack(), it is not compatible
with SafeStack. The resulting binary is incorrect, with different
coroutines sharing the same unsafe stack and producing undefined
behavior at runtime.

In the future LLVM may provide a SafeStack implementation compatible
with sigaltstack(). In the meantime, if SafeStack is desired, the
coroutine implementation from coroutine-ucontext should be used.

As a safety check, add a check in coroutine-sigaltstack to throw a
preprocessor #error if SafeStack is enabled and we are trying to use
coroutine-sigaltstack to implement coroutines.
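
The guard relies on configure defining CONFIG_SAFESTACK, which the next
patch adds. A minimal sketch of how the same condition could be probed
directly in the preprocessor, using clang's __has_feature (shown here
only to illustrate the idiom; the patch itself uses CONFIG_SAFESTACK):

    /* Sketch: direct compile-time probe for SafeStack */
    #ifndef __has_feature
    #define __has_feature(x) 0  /* compilers without __has_feature support */
    #endif

    #if __has_feature(safe_stack)
    #error "SafeStack is not compatible with code run in alternate signal stacks"
    #endif
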
Signed-off-by: Daniele Buono
Message-id: 20200529205122.714-3-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi
---
 util/coroutine-sigaltstack.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/util/coroutine-sigaltstack.c b/util/coroutine-sigaltstack.c
index f6fc49a0e5..aade82afb8 100644
--- a/util/coroutine-sigaltstack.c
+++ b/util/coroutine-sigaltstack.c
@@ -30,6 +30,10 @@
 #include "qemu-common.h"
 #include "qemu/coroutine_int.h"
 
+#ifdef CONFIG_SAFESTACK
+#error "SafeStack is not compatible with code run in alternate signal stacks"
+#endif
+
 typedef struct {
     Coroutine base;
     void *stack;

From 1e4f6065da160977f212270893372436b8f13336 Mon Sep 17 00:00:00 2001
From: Daniele Buono
Date: Fri, 29 May 2020 16:51:21 -0400
Subject: [PATCH 04/12] configure: add flags to support SafeStack

This patch adds a flag to enable/disable the SafeStack instrumentation
provided by LLVM.

On enable, make sure that the compiler supports the flags, and that we
are using the proper coroutine implementation (coroutine-ucontext). On
disable, explicitly disable the option if it was enabled by default.

While SafeStack is supported only on Linux, NetBSD, FreeBSD and macOS,
we are not checking for the OS since this is already done by LLVM.

Signed-off-by: Daniele Buono
Message-id: 20200529205122.714-4-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi
---
 configure | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/configure b/configure
index ba88fd1824..ae8737d5a2 100755
--- a/configure
+++ b/configure
@@ -307,6 +307,7 @@ audio_win_int=""
 libs_qga=""
 debug_info="yes"
 stack_protector=""
+safe_stack=""
 use_containers="yes"
 gdb_bin=$(command -v "gdb-multiarch" || command -v "gdb")
 
@@ -1287,6 +1288,10 @@ for opt do
   ;;
   --disable-stack-protector) stack_protector="no"
   ;;
+  --enable-safe-stack) safe_stack="yes"
+  ;;
+  --disable-safe-stack) safe_stack="no"
+  ;;
   --disable-curses) curses="no"
   ;;
   --enable-curses) curses="yes"
@@ -1829,6 +1834,8 @@ disabled with --disable-FEATURE, default is enabled if available:
   debug-tcg       TCG debugging (default is disabled)
   debug-info      debugging information
   sparse          sparse checker
+  safe-stack      SafeStack Stack Smash Protection. Depends on
+                  clang/llvm >= 3.7 and requires coroutine backend ucontext.
 
   gnutls          GNUTLS cryptography support
   nettle          nettle cryptography support
@@ -5573,6 +5580,67 @@ if test "$debug_stack_usage" = "yes"; then
   fi
 fi
 
+##################################################
+# SafeStack
+
+
+if test "$safe_stack" = "yes"; then
+cat > $TMPC << EOF
+int main(int argc, char *argv[])
+{
+#if ! __has_feature(safe_stack)
+#error SafeStack Disabled
+#endif
+    return 0;
+}
+EOF
+  flag="-fsanitize=safe-stack"
+  # Check that safe-stack is supported and enabled.
+  if compile_prog "-Werror $flag" "$flag"; then
+    # Flag needed both at compilation and at linking
+    QEMU_CFLAGS="$QEMU_CFLAGS $flag"
+    QEMU_LDFLAGS="$QEMU_LDFLAGS $flag"
+  else
+    error_exit "SafeStack not supported by your compiler"
+  fi
+  if test "$coroutine" != "ucontext"; then
+    error_exit "SafeStack is only supported by the coroutine backend ucontext"
+  fi
+else
+cat > $TMPC << EOF
+int main(int argc, char *argv[])
+{
+#if defined(__has_feature)
+#if __has_feature(safe_stack)
+#error SafeStack Enabled
+#endif
+#endif
+    return 0;
+}
+EOF
+  if test "$safe_stack" = "no"; then
+    # Make sure that safe-stack is disabled
+    if ! compile_prog "-Werror" ""; then
+      # SafeStack was already enabled, try to explicitly remove the feature
+      flag="-fno-sanitize=safe-stack"
+      if ! compile_prog "-Werror $flag" "$flag"; then
+        error_exit "Configure cannot disable SafeStack"
+      fi
+      QEMU_CFLAGS="$QEMU_CFLAGS $flag"
+      QEMU_LDFLAGS="$QEMU_LDFLAGS $flag"
+    fi
+  else # "$safe_stack" = ""
+    # Set safe_stack to yes or no based on pre-existing flags
+    if compile_prog "-Werror" ""; then
+      safe_stack="no"
+    else
+      safe_stack="yes"
+      if test "$coroutine" != "ucontext"; then
+        error_exit "SafeStack is only supported by the coroutine backend ucontext"
+      fi
+    fi
+  fi
+fi
 
##########################################
# check if we have open_by_handle_at
@@ -6765,6 +6833,7 @@ echo "sparse enabled $sparse"
 echo "strip binaries $strip_opt"
 echo "profiler $profiler"
 echo "static build $static"
+echo "safe stack $safe_stack"
 if test "$darwin" = "yes" ; then
     echo "Cocoa support $cocoa"
 fi
@@ -8370,6 +8439,10 @@ if test "$ccache_cpp2" = "yes"; then
   echo "export CCACHE_CPP2=y" >> $config_host_mak
 fi
 
+if test "$safe_stack" = "yes"; then
+  echo "CONFIG_SAFESTACK=y" >> $config_host_mak
+fi
+
 # If we're using a separate build tree, set it up now.
 # DIRS are directories which we simply mkdir in the build tree;
 # LINKS are things to symlink back into the source tree

From d6d1a65ccab670788b6c30d918ac7bd636513f8e Mon Sep 17 00:00:00 2001
From: Daniele Buono
Date: Fri, 29 May 2020 16:51:22 -0400
Subject: [PATCH 05/12] check-block: enable iotests with SafeStack

SafeStack is a stack protection technique implemented in LLVM. It is
enabled with a -fsanitize flag.

iotests are currently disabled when any -fsanitize option is used,
because such options tend to produce additional warnings and false
positives.

While common -fsanitize options are used to verify the code and not
added in production, SafeStack's main use is in production environments
to protect against stack smashing.

Since SafeStack does not print any warning or false positive, enable
iotests when SafeStack is the only -fsanitize option used. This is
likely going to be a production binary and we want to make sure it
works correctly.

Signed-off-by: Daniele Buono
Message-id: 20200529205122.714-5-dbuono@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi
---
 tests/check-block.sh | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/check-block.sh b/tests/check-block.sh
index ad320c21ba..8e29c868e5 100755
--- a/tests/check-block.sh
+++ b/tests/check-block.sh
@@ -21,7 +21,17 @@ if grep -q "CONFIG_GPROF=y" config-host.mak 2>/dev/null ; then
     exit 0
 fi
 
-if grep -q "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null ; then
+# Disable tests with any sanitizer except for SafeStack
+CFLAGS=$( grep "CFLAGS.*-fsanitize" config-host.mak 2>/dev/null )
+SANITIZE_FLAGS=""
+# Remove all occurrences of -fsanitize=safe-stack
+for i in ${CFLAGS}; do
+    if [ "${i}" != "-fsanitize=safe-stack" ]; then
+        SANITIZE_FLAGS="${SANITIZE_FLAGS} ${i}"
+    fi
+done
+if echo ${SANITIZE_FLAGS} | grep -q "\-fsanitize" 2>/dev/null; then
+    # Have a sanitizer flag that is not allowed, stop
     echo "Sanitizers are enabled ==> Not running the qemu-iotests."
     exit 0
 fi

From 2446e0e2e9c9aaa5f8e8c7ef9a41fe8516054831 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi
Date: Wed, 17 Jun 2020 14:21:55 +0100
Subject: [PATCH 06/12] block/nvme: poll queues without q->lock

A lot of CPU time is spent simply locking/unlocking q->lock during
polling. Check for completion outside the lock to make q->lock
disappear from the profile.
Signed-off-by: Stefan Hajnoczi
Reviewed-by: Sergio Lopez
Message-id: 20200617132201.1832152-2-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi
---
 block/nvme.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index eb2f54dd9d..e4375ec245 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -512,6 +512,18 @@ static bool nvme_poll_queues(BDRVNVMeState *s)
 
     for (i = 0; i < s->nr_queues; i++) {
         NVMeQueuePair *q = s->queues[i];
+        const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
+        NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
+
+        /*
+         * Do an early check for completions. q->lock isn't needed because
+         * nvme_process_completion() only runs in the event loop thread and
+         * cannot race with itself.
+         */
+        if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
+            continue;
+        }
+
         qemu_mutex_lock(&q->lock);
         while (nvme_process_completion(s, q)) {
             /* Keep polling */

From d38253cf8b44e3b94a5b327d014ab035ae1126ed Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi
Date: Wed, 17 Jun 2020 14:21:56 +0100
Subject: [PATCH 07/12] block/nvme: drop tautologous assertion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

nvme_process_completion() explicitly checks cid so the assertion that
follows is always true:

    if (cid == 0 || cid > NVME_QUEUE_SIZE) {
        ...
        continue;
    }
    assert(cid <= NVME_QUEUE_SIZE);

Signed-off-by: Stefan Hajnoczi
Reviewed-by: Sergio Lopez
Reviewed-by: Philippe Mathieu-Daudé
Message-id: 20200617132201.1832152-3-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi
---
 block/nvme.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index e4375ec245..d567ece3f4 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -336,7 +336,6 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
                 cid);
             continue;
         }
-        assert(cid <= NVME_QUEUE_SIZE);
         trace_nvme_complete_command(s, q->index, cid);
         preq = &q->reqs[cid - 1];
         req = *preq;

From 04b3fb39c815e6de67c5003e610d1cdecc911980 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi
Date: Wed, 17 Jun 2020 14:21:57 +0100
Subject: [PATCH 08/12] block/nvme: don't access CQE after moving cq.head
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Do not access a CQE after incrementing q->cq.head and releasing q->lock.
It is unlikely that this causes problems in practice but it's a latent
bug.

The reason why it should be safe at the moment is that completion
processing is not re-entrant and the CQ doorbell isn't written until the
end of nvme_process_completion().

Make this change now because QEMU expects completion processing to be
re-entrant and later patches will do that.
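
Reduced to its essentials, the rule being applied is: copy what you need
out of a completion entry before advancing cq.head, because once the
head has moved on the slot may be overwritten. A sketch with
hypothetical names (the real change is the ret = nvme_translate_error(c)
hoist in the diff below):

    /* Sketch: safe consumption order for one completion-ring slot */
    static int consume_one(NvmeCqe *ring, unsigned *head, unsigned size)
    {
        int ret = nvme_translate_error(&ring[*head]); /* read the slot... */
        *head = (*head + 1) % size;                   /* ...then advance  */
        return ret;
    }
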
Signed-off-by: Stefan Hajnoczi
Reviewed-by: Sergio Lopez
Reviewed-by: Philippe Mathieu-Daudé
Message-id: 20200617132201.1832152-4-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi
---
 block/nvme.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index d567ece3f4..344893811a 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -321,11 +321,14 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
     q->busy = true;
     assert(q->inflight >= 0);
     while (q->inflight) {
+        int ret;
         int16_t cid;
+
         c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
         if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
             break;
         }
+        ret = nvme_translate_error(c);
         q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
         if (!q->cq.head) {
             q->cq_phase = !q->cq_phase;
@@ -344,7 +347,7 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
         preq->busy = false;
         preq->cb = preq->opaque = NULL;
         qemu_mutex_unlock(&q->lock);
-        req.cb(req.opaque, nvme_translate_error(c));
+        req.cb(req.opaque, ret);
         qemu_mutex_lock(&q->lock);
         q->inflight--;
         progress = true;

From 1086e95da1705087db542276dcbb8ba4d55cb97f Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi
Date: Wed, 17 Jun 2020 14:21:58 +0100
Subject: [PATCH 09/12] block/nvme: switch to a NVMeRequest freelist

There are three issues with the current NVMeRequest->busy field:

1. The busy field is accidentally accessed outside q->lock when request
   submission fails.
2. Waiters on free_req_queue are not woken when a request is returned
   early due to submission failure.
3. Finding a free request involves scanning all requests. This makes
   request submission O(n^2).

Switch to an O(1) freelist that is always accessed under the lock.

Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and
NVME_NUM_REQS, the number of usable requests. This makes the code
simpler than using NVME_QUEUE_SIZE everywhere and having to keep in
mind that one slot is reserved.

Signed-off-by: Stefan Hajnoczi
Reviewed-by: Sergio Lopez
Message-id: 20200617132201.1832152-5-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi
---
 block/nvme.c | 81 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 27 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 344893811a..8e60882af3 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -33,6 +33,12 @@
 #define NVME_QUEUE_SIZE 128
 #define NVME_BAR_SIZE 8192
 
+/*
+ * We have to leave one slot empty as that is the full queue case where
+ * head == tail + 1.
+ */
+#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
+
 typedef struct {
     int32_t head, tail;
     uint8_t *queue;
@@ -47,7 +53,7 @@ typedef struct {
     int cid;
     void *prp_list_page;
     uint64_t prp_list_iova;
-    bool busy;
+    int free_req_next; /* q->reqs[] index of next free req */
 } NVMeRequest;
 
 typedef struct {
@@ -61,7 +67,8 @@ typedef struct {
     /* Fields protected by @lock */
     NVMeQueue sq, cq;
     int cq_phase;
-    NVMeRequest reqs[NVME_QUEUE_SIZE];
+    int free_req_head;
+    NVMeRequest reqs[NVME_NUM_REQS];
     bool busy;
     int need_kick;
     int inflight;
@@ -200,19 +207,23 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
     qemu_mutex_init(&q->lock);
     q->index = idx;
     qemu_co_queue_init(&q->free_req_queue);
-    q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
+    q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
-                          s->page_size * NVME_QUEUE_SIZE,
+                          s->page_size * NVME_NUM_REQS,
                           false, &prp_list_iova);
     if (r) {
         goto fail;
     }
-    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
+    q->free_req_head = -1;
+    for (i = 0; i < NVME_NUM_REQS; i++) {
         NVMeRequest *req = &q->reqs[i];
         req->cid = i + 1;
+        req->free_req_next = q->free_req_head;
+        q->free_req_head = i;
         req->prp_list_page = q->prp_list_pages + i * s->page_size;
         req->prp_list_iova = prp_list_iova + i * s->page_size;
     }
+
     nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
@@ -254,13 +265,11 @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
  */
 static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
 {
-    int i;
-    NVMeRequest *req = NULL;
+    NVMeRequest *req;
 
     qemu_mutex_lock(&q->lock);
-    while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
-        /* We have to leave one slot empty as that is the full queue case (head
-         * == tail + 1). */
+
+    while (q->free_req_head == -1) {
         if (qemu_in_coroutine()) {
             trace_nvme_free_req_queue_wait(q);
             qemu_co_queue_wait(&q->free_req_queue, &q->lock);
@@ -269,20 +278,41 @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
             return NULL;
         }
     }
-    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
-        if (!q->reqs[i].busy) {
-            q->reqs[i].busy = true;
-            req = &q->reqs[i];
-            break;
-        }
-    }
-    /* We have checked inflight and need_kick while holding q->lock, so one
-     * free req must be available.
-     */
-    assert(req);
+
+    req = &q->reqs[q->free_req_head];
+    q->free_req_head = req->free_req_next;
+    req->free_req_next = -1;
+
     qemu_mutex_unlock(&q->lock);
     return req;
 }
 
+/* With q->lock */
+static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
+{
+    req->free_req_next = q->free_req_head;
+    q->free_req_head = req - q->reqs;
+}
+
+/* With q->lock */
+static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
+{
+    if (!qemu_co_queue_empty(&q->free_req_queue)) {
+        replay_bh_schedule_oneshot_event(s->aio_context,
+                                         nvme_free_req_queue_cb, q);
+    }
+}
+
+/* Insert a request in the freelist and wake waiters */
+static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q,
+                                       NVMeRequest *req)
+{
+    qemu_mutex_lock(&q->lock);
+    nvme_put_free_req_locked(q, req);
+    nvme_wake_free_req_locked(s, q);
+    qemu_mutex_unlock(&q->lock);
+}
+
 static inline int nvme_translate_error(const NvmeCqe *c)
 {
     uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
@@ -344,7 +374,7 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
         req = *preq;
         assert(req.cid == cid);
         assert(req.cb);
-        preq->busy = false;
+        nvme_put_free_req_locked(q, preq);
         preq->cb = preq->opaque = NULL;
         qemu_mutex_unlock(&q->lock);
         req.cb(req.opaque, ret);
@@ -356,10 +386,7 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
         /* Notify the device so it can post more completions. */
         smp_mb_release();
         *q->cq.doorbell = cpu_to_le32(q->cq.head);
-        if (!qemu_co_queue_empty(&q->free_req_queue)) {
-            replay_bh_schedule_oneshot_event(s->aio_context,
-                                             nvme_free_req_queue_cb, q);
-        }
+        nvme_wake_free_req_locked(s, q);
     }
     q->busy = false;
     return progress;
@@ -1001,7 +1028,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
     qemu_co_mutex_unlock(&s->dma_map_lock);
     if (r) {
-        req->busy = false;
+        nvme_put_free_req_and_wake(s, ioq, req);
         return r;
     }
     nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
@@ -1218,7 +1245,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
     qemu_co_mutex_unlock(&s->dma_map_lock);
 
     if (ret) {
-        req->busy = false;
+        nvme_put_free_req_and_wake(s, ioq, req);
         goto out;
     }

From a5db74f324ee55badfa61b03922ec24439bb94a6 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi
Date: Wed, 17 Jun 2020 14:21:59 +0100
Subject: [PATCH 10/12] block/nvme: clarify that free_req_queue is protected
 by q->lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Existing users access free_req_queue under q->lock. Document this.
Signed-off-by: Stefan Hajnoczi
Reviewed-by: Sergio Lopez
Reviewed-by: Philippe Mathieu-Daudé
Message-id: 20200617132201.1832152-6-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi
---
 block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index 8e60882af3..426c77e5ab 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -57,7 +57,6 @@ typedef struct {
 } NVMeRequest;
 
 typedef struct {
-    CoQueue free_req_queue;
     QemuMutex lock;
 
     /* Fields protected by BQL */
@@ -65,6 +64,7 @@ typedef struct {
     uint8_t *prp_list_pages;
 
     /* Fields protected by @lock */
+    CoQueue free_req_queue;
     NVMeQueue sq, cq;
     int cq_phase;
     int free_req_head;

From b75fd5f55467307a6e367bc349a8ea6ce30d8a1c Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi
Date: Wed, 17 Jun 2020 14:22:00 +0100
Subject: [PATCH 11/12] block/nvme: keep BDRVNVMeState pointer in
 NVMeQueuePair
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Passing around both BDRVNVMeState and NVMeQueuePair is unwieldy. Reduce
the number of function arguments by keeping the BDRVNVMeState pointer
in NVMeQueuePair. This will come in handy when a BH is introduced in a
later patch and only one argument can be passed to it.

Signed-off-by: Stefan Hajnoczi
Reviewed-by: Sergio Lopez
Reviewed-by: Philippe Mathieu-Daudé
Message-id: 20200617132201.1832152-7-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi
---
 block/nvme.c | 70 ++++++++++++++++++++++++++++------------------------
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 426c77e5ab..8dc68d3daa 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -39,6 +39,8 @@
  */
 #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
 
+typedef struct BDRVNVMeState BDRVNVMeState;
+
 typedef struct {
     int32_t head, tail;
     uint8_t *queue;
@@ -59,8 +61,11 @@ typedef struct {
 typedef struct {
     QemuMutex lock;
 
+    /* Read from I/O code path, initialized under BQL */
+    BDRVNVMeState *s;
+    int index;
+
     /* Fields protected by BQL */
-    int index;
     uint8_t *prp_list_pages;
 
     /* Fields protected by @lock */
@@ -96,7 +101,7 @@ typedef volatile struct {
 
 QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);
 
-typedef struct {
+struct BDRVNVMeState {
     AioContext *aio_context;
     QEMUVFIOState *vfio;
     NVMeRegs *regs;
@@ -130,7 +135,7 @@ typedef struct {
 
     /* PCI address (required for nvme_refresh_filename()) */
     char *device;
-} BDRVNVMeState;
+};
 
 #define NVME_BLOCK_OPT_DEVICE "device"
 #define NVME_BLOCK_OPT_NAMESPACE "namespace"
@@ -174,7 +179,7 @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
     }
 }
 
-static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q)
+static void nvme_free_queue_pair(NVMeQueuePair *q)
 {
     qemu_vfree(q->prp_list_pages);
     qemu_vfree(q->sq.queue);
@@ -205,6 +210,7 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
     uint64_t prp_list_iova;
 
     qemu_mutex_init(&q->lock);
+    q->s = s;
     q->index = idx;
     qemu_co_queue_init(&q->free_req_queue);
     q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
@@ -240,13 +246,15 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
     return q;
 fail:
-    nvme_free_queue_pair(bs, q);
+    nvme_free_queue_pair(q);
     return NULL;
 }
 
 /* With q->lock */
-static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
+static void nvme_kick(NVMeQueuePair *q)
 {
+    BDRVNVMeState *s = q->s;
+
     if (s->plugged || !q->need_kick) {
         return;
     }
@@ -295,21 +303,20 @@ static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
 }
 
 /* With q->lock */
-static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
+static void nvme_wake_free_req_locked(NVMeQueuePair *q)
 {
     if (!qemu_co_queue_empty(&q->free_req_queue)) {
-        replay_bh_schedule_oneshot_event(s->aio_context,
+        replay_bh_schedule_oneshot_event(q->s->aio_context,
                                          nvme_free_req_queue_cb, q);
     }
 }
 
 /* Insert a request in the freelist and wake waiters */
-static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q,
-                                       NVMeRequest *req)
+static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
 {
     qemu_mutex_lock(&q->lock);
     nvme_put_free_req_locked(q, req);
-    nvme_wake_free_req_locked(s, q);
+    nvme_wake_free_req_locked(q);
     qemu_mutex_unlock(&q->lock);
 }
 
@@ -336,8 +343,9 @@ static inline int nvme_translate_error(const NvmeCqe *c)
 }
 
 /* With q->lock */
-static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
+static bool nvme_process_completion(NVMeQueuePair *q)
 {
+    BDRVNVMeState *s = q->s;
     bool progress = false;
     NVMeRequest *preq;
     NVMeRequest req;
@@ -386,7 +394,7 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
         /* Notify the device so it can post more completions. */
         smp_mb_release();
         *q->cq.doorbell = cpu_to_le32(q->cq.head);
-        nvme_wake_free_req_locked(s, q);
+        nvme_wake_free_req_locked(q);
     }
     q->busy = false;
     return progress;
@@ -403,8 +411,7 @@ static void nvme_trace_command(const NvmeCmd *cmd)
     }
 }
 
-static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
-                                NVMeRequest *req,
+static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
                                 NvmeCmd *cmd, BlockCompletionFunc cb,
                                 void *opaque)
 {
@@ -413,15 +420,15 @@ static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
     req->opaque = opaque;
     cmd->cid = cpu_to_le32(req->cid);
 
-    trace_nvme_submit_command(s, q->index, req->cid);
+    trace_nvme_submit_command(q->s, q->index, req->cid);
     nvme_trace_command(cmd);
     qemu_mutex_lock(&q->lock);
     memcpy((uint8_t *)q->sq.queue +
            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
     q->need_kick++;
-    nvme_kick(s, q);
-    nvme_process_completion(s, q);
+    nvme_kick(q);
+    nvme_process_completion(q);
     qemu_mutex_unlock(&q->lock);
 }
 
@@ -436,13 +443,12 @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
                          NvmeCmd *cmd)
 {
     NVMeRequest *req;
-    BDRVNVMeState *s = bs->opaque;
     int ret = -EINPROGRESS;
     req = nvme_get_free_req(q);
     if (!req) {
         return -EBUSY;
     }
-    nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret);
+    nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
 
     BDRV_POLL_WHILE(bs, ret == -EINPROGRESS);
     return ret;
@@ -554,7 +560,7 @@ static bool nvme_poll_queues(BDRVNVMeState *s)
         }
 
         qemu_mutex_lock(&q->lock);
-        while (nvme_process_completion(s, q)) {
+        while (nvme_process_completion(q)) {
             /* Keep polling */
             progress = true;
         }
@@ -592,7 +598,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     };
     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
         error_setg(errp, "Failed to create io queue [%d]", n);
-        nvme_free_queue_pair(bs, q);
+        nvme_free_queue_pair(q);
         return false;
     }
     cmd = (NvmeCmd) {
@@ -603,7 +609,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     };
     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
         error_setg(errp, "Failed to create io queue [%d]", n);
-        nvme_free_queue_pair(bs, q);
+        nvme_free_queue_pair(q);
         return false;
     }
     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
@@ -798,7 +804,7 @@ static void nvme_close(BlockDriverState *bs)
     BDRVNVMeState *s = bs->opaque;
 
     for (i = 0; i < s->nr_queues; ++i) {
-        nvme_free_queue_pair(bs, s->queues[i]);
+        nvme_free_queue_pair(s->queues[i]);
     }
     g_free(s->queues);
     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
@@ -1028,10 +1034,10 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
     qemu_co_mutex_unlock(&s->dma_map_lock);
     if (r) {
-        nvme_put_free_req_and_wake(s, ioq, req);
+        nvme_put_free_req_and_wake(ioq, req);
         return r;
     }
-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
 
     data.co = qemu_coroutine_self();
     while (data.ret == -EINPROGRESS) {
@@ -1131,7 +1137,7 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
     assert(s->nr_queues > 1);
     req = nvme_get_free_req(ioq);
     assert(req);
-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
 
     data.co = qemu_coroutine_self();
     if (data.ret == -EINPROGRESS) {
@@ -1184,7 +1190,7 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
 
     req = nvme_get_free_req(ioq);
     assert(req);
-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
 
     data.co = qemu_coroutine_self();
     while (data.ret == -EINPROGRESS) {
@@ -1245,13 +1251,13 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
     qemu_co_mutex_unlock(&s->dma_map_lock);
 
     if (ret) {
-        nvme_put_free_req_and_wake(s, ioq, req);
+        nvme_put_free_req_and_wake(ioq, req);
         goto out;
     }
 
     trace_nvme_dsm(s, offset, bytes);
 
-    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
 
     data.co = qemu_coroutine_self();
     while (data.ret == -EINPROGRESS) {
@@ -1333,8 +1339,8 @@ static void nvme_aio_unplug(BlockDriverState *bs)
     for (i = 1; i < s->nr_queues; i++) {
         NVMeQueuePair *q = s->queues[i];
         qemu_mutex_lock(&q->lock);
-        nvme_kick(s, q);
-        nvme_process_completion(s, q);
+        nvme_kick(q);
+        nvme_process_completion(q);
         qemu_mutex_unlock(&q->lock);
     }
 }

From 7838c67f22a81fcf669785cd6c0876438422071a Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi
Date: Wed, 17 Jun 2020 14:22:01 +0100
Subject: [PATCH 12/12] block/nvme: support nested aio_poll()

QEMU block drivers are supposed to support aio_poll() from I/O
completion callback functions. This means completion processing must be
re-entrant.

The standard approach is to schedule a BH during completion processing
and cancel it at the end of processing. If aio_poll() is invoked by a
callback function then the BH will run. The BH continues the suspended
completion processing.

All of this means that request A's cb() can synchronously wait for
request B to complete. Previously the nvme block driver would hang
because it didn't process completions from nested aio_poll().
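
The shape of the fix, distilled (the qemu_bh_* calls are the real QEMU
API; the other helpers here are hypothetical, and the actual code is in
nvme_process_completion() below):

    /* Sketch: schedule-early/cancel-late BH for re-entrant completion */
    static void process_completions(NVMeQueuePair *q)
    {
        /* Arm the BH first: if a cb() below runs a nested aio_poll(),
         * the BH fires there and resumes completion processing. */
        qemu_bh_schedule(q->completion_bh);

        while (completions_pending(q)) {    /* hypothetical helper */
            complete_one_request(q);        /* its cb() may call aio_poll() */
        }

        /* This invocation has drained everything, disarm the BH. */
        qemu_bh_cancel(q->completion_bh);
    }
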
Signed-off-by: Stefan Hajnoczi
Reviewed-by: Sergio Lopez
Message-id: 20200617132201.1832152-8-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi
---
 block/nvme.c       | 63 +++++++++++++++++++++++++++++++++++++++++-----
 block/trace-events |  2 +-
 2 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 8dc68d3daa..374e268915 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -74,9 +74,11 @@ typedef struct {
     int cq_phase;
     int free_req_head;
     NVMeRequest reqs[NVME_NUM_REQS];
-    bool busy;
     int need_kick;
     int inflight;
+
+    /* Thread-safe, no lock necessary */
+    QEMUBH *completion_bh;
 } NVMeQueuePair;
 
 /* Memory mapped registers */
@@ -140,6 +142,8 @@ struct BDRVNVMeState {
 #define NVME_BLOCK_OPT_DEVICE "device"
 #define NVME_BLOCK_OPT_NAMESPACE "namespace"
 
+static void nvme_process_completion_bh(void *opaque);
+
 static QemuOptsList runtime_opts = {
     .name = "nvme",
     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
@@ -181,6 +185,9 @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
 
 static void nvme_free_queue_pair(NVMeQueuePair *q)
 {
+    if (q->completion_bh) {
+        qemu_bh_delete(q->completion_bh);
+    }
     qemu_vfree(q->prp_list_pages);
     qemu_vfree(q->sq.queue);
     qemu_vfree(q->cq.queue);
@@ -214,6 +221,8 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
     q->index = idx;
     qemu_co_queue_init(&q->free_req_queue);
     q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
+    q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs),
+                                  nvme_process_completion_bh, q);
     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
                           s->page_size * NVME_NUM_REQS,
                           false, &prp_list_iova);
@@ -352,11 +361,21 @@ static bool nvme_process_completion(NVMeQueuePair *q)
     NvmeCqe *c;
 
     trace_nvme_process_completion(s, q->index, q->inflight);
-    if (q->busy || s->plugged) {
-        trace_nvme_process_completion_queue_busy(s, q->index);
+    if (s->plugged) {
+        trace_nvme_process_completion_queue_plugged(s, q->index);
         return false;
     }
-    q->busy = true;
+
+    /*
+     * Support re-entrancy when a request cb() function invokes aio_poll().
+     * Pending completions must be visible to aio_poll() so that a cb()
+     * function can wait for the completion of another request.
+     *
+     * The aio_poll() loop will execute our BH and we'll resume completion
+     * processing there.
+     */
+    qemu_bh_schedule(q->completion_bh);
+
     assert(q->inflight >= 0);
     while (q->inflight) {
         int ret;
@@ -384,10 +403,10 @@ static bool nvme_process_completion(NVMeQueuePair *q)
         assert(req.cb);
         nvme_put_free_req_locked(q, preq);
         preq->cb = preq->opaque = NULL;
+        q->inflight--;
         qemu_mutex_unlock(&q->lock);
         req.cb(req.opaque, ret);
         qemu_mutex_lock(&q->lock);
-        q->inflight--;
         progress = true;
     }
     if (progress) {
@@ -396,10 +415,28 @@ static bool nvme_process_completion(NVMeQueuePair *q)
         *q->cq.doorbell = cpu_to_le32(q->cq.head);
         nvme_wake_free_req_locked(q);
     }
-    q->busy = false;
+
+    qemu_bh_cancel(q->completion_bh);
+
     return progress;
 }
 
+static void nvme_process_completion_bh(void *opaque)
+{
+    NVMeQueuePair *q = opaque;
+
+    /*
+     * We're being invoked because a nvme_process_completion() cb() function
+     * called aio_poll(). The callback may be waiting for further completions
+     * so notify the device that it has space to fill in more completions now.
+ */ + smp_mb_release(); + *q->cq.doorbell = cpu_to_le32(q->cq.head); + nvme_wake_free_req_locked(q); + + nvme_process_completion(q); +} + static void nvme_trace_command(const NvmeCmd *cmd) { int i; @@ -1309,6 +1346,13 @@ static void nvme_detach_aio_context(BlockDriverState *bs) { BDRVNVMeState *s = bs->opaque; + for (int i = 0; i < s->nr_queues; i++) { + NVMeQueuePair *q = s->queues[i]; + + qemu_bh_delete(q->completion_bh); + q->completion_bh = NULL; + } + aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, false, NULL, NULL); } @@ -1321,6 +1365,13 @@ static void nvme_attach_aio_context(BlockDriverState *bs, s->aio_context = new_context; aio_set_event_notifier(new_context, &s->irq_notifier, false, nvme_handle_event, nvme_poll_cb); + + for (int i = 0; i < s->nr_queues; i++) { + NVMeQueuePair *q = s->queues[i]; + + q->completion_bh = + aio_bh_new(new_context, nvme_process_completion_bh, q); + } } static void nvme_aio_plug(BlockDriverState *bs) diff --git a/block/trace-events b/block/trace-events index 29dff8881c..dbe76a7613 100644 --- a/block/trace-events +++ b/block/trace-events @@ -158,7 +158,7 @@ nvme_kick(void *s, int queue) "s %p queue %d" nvme_dma_flush_queue_wait(void *s) "s %p" nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x" nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d" -nvme_process_completion_queue_busy(void *s, int index) "s %p queue %d" +nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d" nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d" nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d" nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"