From ba816ad61fdf31f59f423a773b00bfa2ed38243a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 28 Sep 2019 11:36:45 -0600 Subject: [PATCH 01/64] io_uring: run dependent links inline if possible Currently any dependent link is executed from a new workqueue context, which means that we'll be doing a context switch per link in the chain. If we are running the completion of the current request from our async workqueue and find that the next request is a link, then run it directly from the workqueue context instead of forcing another switch. This improves the performance of linked SQEs, and reduces the CPU overhead. Reviewed-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 160 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 113 insertions(+), 47 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c11c4157a4c2..5db0854fec74 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -666,7 +666,7 @@ static void __io_free_req(struct io_kiocb *req) kmem_cache_free(req_cachep, req); } -static void io_req_link_next(struct io_kiocb *req) +static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { struct io_kiocb *nxt; @@ -685,8 +685,16 @@ static void io_req_link_next(struct io_kiocb *req) } nxt->flags |= REQ_F_LINK_DONE; - INIT_WORK(&nxt->work, io_sq_wq_submit_work); - io_queue_async_work(req->ctx, nxt); + /* + * If we're in async work, we can continue processing the chain + * in this context instead of having to queue up new async work. + */ + if (nxtptr && current_work()) { + *nxtptr = nxt; + } else { + INIT_WORK(&nxt->work, io_sq_wq_submit_work); + io_queue_async_work(req->ctx, nxt); + } } } @@ -706,7 +714,7 @@ static void io_fail_links(struct io_kiocb *req) } } -static void io_free_req(struct io_kiocb *req) +static void io_free_req(struct io_kiocb *req, struct io_kiocb **nxt) { /* * If LINK is set, we have dependent requests in this chain. If we @@ -718,16 +726,39 @@ static void io_free_req(struct io_kiocb *req) if (req->flags & REQ_F_FAIL_LINK) io_fail_links(req); else - io_req_link_next(req); + io_req_link_next(req, nxt); } __io_free_req(req); } -static void io_put_req(struct io_kiocb *req) +/* + * Drop reference to request, return next in chain (if there is one) if this + * was the last reference to this request. + */ +static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) { + struct io_kiocb *nxt = NULL; + if (refcount_dec_and_test(&req->refs)) - io_free_req(req); + io_free_req(req, &nxt); + + return nxt; +} + +static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr) +{ + struct io_kiocb *nxt; + + nxt = io_put_req_find_next(req); + if (nxt) { + if (nxtptr) { + *nxtptr = nxt; + } else { + INIT_WORK(&nxt->work, io_sq_wq_submit_work); + io_queue_async_work(nxt->ctx, nxt); + } + } } static unsigned io_cqring_events(struct io_rings *rings) @@ -775,7 +806,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); } else { - io_free_req(req); + io_free_req(req, NULL); } } } @@ -947,7 +978,7 @@ static void kiocb_end_write(struct io_kiocb *req) file_end_write(req->file); } -static void io_complete_rw(struct kiocb *kiocb, long res, long res2) +static void io_complete_rw_common(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); @@ -957,7 +988,22 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req->ctx, req->user_data, res); - io_put_req(req); +} + +static void io_complete_rw(struct kiocb *kiocb, long res, long res2) +{ + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + + io_complete_rw_common(kiocb, res); + io_put_req(req, NULL); +} + +static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) +{ + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + + io_complete_rw_common(kiocb, res); + return io_put_req_find_next(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -1153,6 +1199,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } +static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, + bool in_async) +{ + if (in_async && ret >= 0 && nxt && kiocb->ki_complete == io_complete_rw) + *nxt = __io_complete_rw(kiocb, ret); + else + io_rw_done(kiocb, ret); +} + static int io_import_fixed(struct io_ring_ctx *ctx, int rw, const struct io_uring_sqe *sqe, struct iov_iter *iter) @@ -1369,7 +1424,7 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, } static int io_read(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock) + struct io_kiocb **nxt, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; @@ -1418,7 +1473,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { - io_rw_done(kiocb, ret2); + kiocb_done(kiocb, ret2, nxt, s->needs_lock); } else { /* * If ->needs_lock is true, we're already in async @@ -1434,7 +1489,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, } static int io_write(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock) + struct io_kiocb **nxt, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; @@ -1492,7 +1547,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, else ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) { - io_rw_done(kiocb, ret2); + kiocb_done(kiocb, ret2, nxt, s->needs_lock); } else { /* * If ->needs_lock is true, we're already in async @@ -1520,7 +1575,7 @@ static int io_nop(struct io_kiocb *req, u64 user_data) return -EINVAL; io_cqring_add_event(ctx, user_data, err); - io_put_req(req); + io_put_req(req, NULL); return 0; } @@ -1540,7 +1595,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) } static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) + struct io_kiocb **nxt, bool force_nonblock) { loff_t sqe_off = READ_ONCE(sqe->off); loff_t sqe_len = READ_ONCE(sqe->len); @@ -1567,7 +1622,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req->ctx, sqe->user_data, ret); - io_put_req(req); + io_put_req(req, nxt); return 0; } @@ -1589,6 +1644,7 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_sync_file_range(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) { loff_t sqe_off; @@ -1613,13 +1669,13 @@ static int io_sync_file_range(struct io_kiocb *req, if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req->ctx, sqe->user_data, ret); - io_put_req(req); + io_put_req(req, nxt); return 0; } #if defined(CONFIG_NET) static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock, + struct io_kiocb **nxt, bool force_nonblock, long (*fn)(struct socket *, struct user_msghdr __user *, unsigned int)) { @@ -1649,26 +1705,28 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, } io_cqring_add_event(req->ctx, sqe->user_data, ret); - io_put_req(req); + io_put_req(req, nxt); return 0; } #endif static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) + struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock); + return io_send_recvmsg(req, sqe, nxt, force_nonblock, + __sys_sendmsg_sock); #else return -EOPNOTSUPP; #endif } static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) + struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock); + return io_send_recvmsg(req, sqe, nxt, force_nonblock, + __sys_recvmsg_sock); #else return -EOPNOTSUPP; #endif @@ -1728,7 +1786,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) spin_unlock_irq(&ctx->completion_lock); io_cqring_add_event(req->ctx, sqe->user_data, ret); - io_put_req(req); + io_put_req(req, NULL); return 0; } @@ -1769,7 +1827,7 @@ static void io_poll_complete_work(struct work_struct *work) spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_put_req(req); + io_put_req(req, NULL); } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -1794,7 +1852,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - io_put_req(req); + io_put_req(req, NULL); } else { io_queue_async_work(ctx, req); } @@ -1886,7 +1944,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (mask) { io_cqring_ev_posted(ctx); - io_put_req(req); + io_put_req(req, NULL); } return ipt.error; } @@ -1919,7 +1977,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) io_cqring_ev_posted(ctx); - io_put_req(req); + io_put_req(req, NULL); return HRTIMER_NORESTART; } @@ -2028,7 +2086,8 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, } static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct sqe_submit *s, bool force_nonblock) + const struct sqe_submit *s, struct io_kiocb **nxt, + bool force_nonblock) { int ret, opcode; @@ -2045,21 +2104,21 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_READV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_read(req, s, force_nonblock); + ret = io_read(req, s, nxt, force_nonblock); break; case IORING_OP_WRITEV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_write(req, s, force_nonblock); + ret = io_write(req, s, nxt, force_nonblock); break; case IORING_OP_READ_FIXED: - ret = io_read(req, s, force_nonblock); + ret = io_read(req, s, nxt, force_nonblock); break; case IORING_OP_WRITE_FIXED: - ret = io_write(req, s, force_nonblock); + ret = io_write(req, s, nxt, force_nonblock); break; case IORING_OP_FSYNC: - ret = io_fsync(req, s->sqe, force_nonblock); + ret = io_fsync(req, s->sqe, nxt, force_nonblock); break; case IORING_OP_POLL_ADD: ret = io_poll_add(req, s->sqe); @@ -2068,13 +2127,13 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_poll_remove(req, s->sqe); break; case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, s->sqe, force_nonblock); + ret = io_sync_file_range(req, s->sqe, nxt, force_nonblock); break; case IORING_OP_SENDMSG: - ret = io_sendmsg(req, s->sqe, force_nonblock); + ret = io_sendmsg(req, s->sqe, nxt, force_nonblock); break; case IORING_OP_RECVMSG: - ret = io_recvmsg(req, s->sqe, force_nonblock); + ret = io_recvmsg(req, s->sqe, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: ret = io_timeout(req, s->sqe); @@ -2141,6 +2200,7 @@ static void io_sq_wq_submit_work(struct work_struct *work) struct sqe_submit *s = &req->submit; const struct io_uring_sqe *sqe = s->sqe; unsigned int flags = req->flags; + struct io_kiocb *nxt = NULL; /* Ensure we clear previously set non-block flag */ req->rw.ki_flags &= ~IOCB_NOWAIT; @@ -2161,7 +2221,7 @@ static void io_sq_wq_submit_work(struct work_struct *work) s->has_user = cur_mm != NULL; s->needs_lock = true; do { - ret = __io_submit_sqe(ctx, req, s, false); + ret = __io_submit_sqe(ctx, req, s, &nxt, false); /* * We can get EAGAIN for polled IO even though * we're forcing a sync submission from here, @@ -2175,16 +2235,22 @@ static void io_sq_wq_submit_work(struct work_struct *work) } /* drop submission reference */ - io_put_req(req); + io_put_req(req, NULL); if (ret) { io_cqring_add_event(ctx, sqe->user_data, ret); - io_put_req(req); + io_put_req(req, NULL); } /* async context always use a copy of the sqe */ kfree(sqe); + /* if a dependent link is ready, do that as the next one */ + if (!ret && nxt) { + req = nxt; + continue; + } + /* req from defer and link list needn't decrease async cnt */ if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE)) goto out; @@ -2331,7 +2397,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, { int ret; - ret = __io_submit_sqe(ctx, req, s, true); + ret = __io_submit_sqe(ctx, req, s, NULL, true); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -2364,14 +2430,14 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, } /* drop submission reference */ - io_put_req(req); + io_put_req(req, NULL); /* and drop final reference, if we failed */ if (ret) { io_cqring_add_event(ctx, req->user_data, ret); if (req->flags & REQ_F_LINK) req->flags |= REQ_F_FAIL_LINK; - io_put_req(req); + io_put_req(req, NULL); } return ret; @@ -2385,7 +2451,7 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_req_defer(ctx, req, s->sqe); if (ret) { if (ret != -EIOCBQUEUED) { - io_free_req(req); + io_free_req(req, NULL); io_cqring_add_event(ctx, s->sqe->user_data, ret); } return 0; @@ -2412,7 +2478,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_req_defer(ctx, req, s->sqe); if (ret) { if (ret != -EIOCBQUEUED) { - io_free_req(req); + io_free_req(req, NULL); __io_free_req(shadow); io_cqring_add_event(ctx, s->sqe->user_data, ret); return 0; @@ -2460,7 +2526,7 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, ret = io_req_set_file(ctx, s, state, req); if (unlikely(ret)) { err_req: - io_free_req(req); + io_free_req(req, NULL); err: io_cqring_add_event(ctx, s->sqe->user_data, ret); return; From 08a451739a9b5783f67de51e84cb6d9559bb9dc4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Oct 2019 08:11:03 -0600 Subject: [PATCH 02/64] io_uring: allow sparse fixed file sets This is in preparation for allowing updates to fixed file sets without requiring a full unregister+register. Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/io_uring.c | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5db0854fec74..b85e5feb774a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2379,6 +2379,8 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, if (unlikely(!ctx->user_files || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; + if (!ctx->user_files[fd]) + return -EBADF; req->file = ctx->user_files[fd]; req->flags |= REQ_F_FIXED_FILE; } else { @@ -2999,7 +3001,8 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) int i; for (i = 0; i < ctx->nr_user_files; i++) - fput(ctx->user_files[i]); + if (ctx->user_files[i]) + fput(ctx->user_files[i]); #endif } @@ -3067,7 +3070,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) struct sock *sk = ctx->ring_sock->sk; struct scm_fp_list *fpl; struct sk_buff *skb; - int i; + int i, nr_files; if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { unsigned long inflight = ctx->user->unix_inflight + nr; @@ -3087,21 +3090,31 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) } skb->sk = sk; - skb->destructor = io_destruct_skb; + nr_files = 0; fpl->user = get_uid(ctx->user); for (i = 0; i < nr; i++) { - fpl->fp[i] = get_file(ctx->user_files[i + offset]); - unix_inflight(fpl->user, fpl->fp[i]); + if (!ctx->user_files[i + offset]) + continue; + fpl->fp[nr_files] = get_file(ctx->user_files[i + offset]); + unix_inflight(fpl->user, fpl->fp[nr_files]); + nr_files++; } - fpl->max = fpl->count = nr; - UNIXCB(skb).fp = fpl; - refcount_add(skb->truesize, &sk->sk_wmem_alloc); - skb_queue_head(&sk->sk_receive_queue, skb); + if (nr_files) { + fpl->max = SCM_MAX_FD; + fpl->count = nr_files; + UNIXCB(skb).fp = fpl; + skb->destructor = io_destruct_skb; + refcount_add(skb->truesize, &sk->sk_wmem_alloc); + skb_queue_head(&sk->sk_receive_queue, skb); - for (i = 0; i < nr; i++) - fput(fpl->fp[i]); + for (i = 0; i < nr_files; i++) + fput(fpl->fp[i]); + } else { + kfree_skb(skb); + kfree(fpl); + } return 0; } @@ -3132,7 +3145,8 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) return 0; while (total < ctx->nr_user_files) { - fput(ctx->user_files[total]); + if (ctx->user_files[total]) + fput(ctx->user_files[total]); total++; } @@ -3163,10 +3177,15 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (!ctx->user_files) return -ENOMEM; - for (i = 0; i < nr_args; i++) { + for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { ret = -EFAULT; if (copy_from_user(&fd, &fds[i], sizeof(fd))) break; + /* allow sparse sets */ + if (fd == -1) { + ret = 0; + continue; + } ctx->user_files[i] = fget(fd); @@ -3184,13 +3203,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(ctx->user_files[i]); break; } - ctx->nr_user_files++; ret = 0; } if (ret) { for (i = 0; i < ctx->nr_user_files; i++) - fput(ctx->user_files[i]); + if (ctx->user_files[i]) + fput(ctx->user_files[i]); kfree(ctx->user_files); ctx->user_files = NULL; From c3a31e605620c279163c14068a60869ea3fda203 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Oct 2019 13:59:56 -0600 Subject: [PATCH 03/64] io_uring: add support for IORING_REGISTER_FILES_UPDATE Allows the application to remove/replace/add files to/from a file set. Passes in a struct: struct io_uring_files_update { __u32 offset; __s32 *fds; }; that holds an array of fds, size of array passed in through the usual nr_args part of the io_uring_register() system call. The logic is as follows: 1) If ->fds[i] is -1, the existing file at i + ->offset is removed from the set. 2) If ->fds[i] is a valid fd, the existing file at i + ->offset is replaced with ->fds[i]. For case #2, is the existing file is currently empty (fd == -1), the new fd is simply added to the array. Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/io_uring.c | 175 ++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 6 ++ 2 files changed, 181 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index b85e5feb774a..77774abb1074 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3224,6 +3224,178 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } +static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index) +{ +#if defined(CONFIG_UNIX) + struct file *file = ctx->user_files[index]; + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff_head list, *head = &sock->sk_receive_queue; + struct sk_buff *skb; + int i; + + __skb_queue_head_init(&list); + + /* + * Find the skb that holds this file in its SCM_RIGHTS. When found, + * remove this entry and rearrange the file array. + */ + skb = skb_dequeue(head); + while (skb) { + struct scm_fp_list *fp; + + fp = UNIXCB(skb).fp; + for (i = 0; i < fp->count; i++) { + int left; + + if (fp->fp[i] != file) + continue; + + unix_notinflight(fp->user, fp->fp[i]); + left = fp->count - 1 - i; + if (left) { + memmove(&fp->fp[i], &fp->fp[i + 1], + left * sizeof(struct file *)); + } + fp->count--; + if (!fp->count) { + kfree_skb(skb); + skb = NULL; + } else { + __skb_queue_tail(&list, skb); + } + fput(file); + file = NULL; + break; + } + + if (!file) + break; + + __skb_queue_tail(&list, skb); + + skb = skb_dequeue(head); + } + + if (skb_peek(&list)) { + spin_lock_irq(&head->lock); + while ((skb = __skb_dequeue(&list)) != NULL) + __skb_queue_tail(head, skb); + spin_unlock_irq(&head->lock); + } +#else + fput(ctx->user_files[index]); +#endif +} + +static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, + int index) +{ +#if defined(CONFIG_UNIX) + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff_head *head = &sock->sk_receive_queue; + struct sk_buff *skb; + + /* + * See if we can merge this file into an existing skb SCM_RIGHTS + * file set. If there's no room, fall back to allocating a new skb + * and filling it in. + */ + spin_lock_irq(&head->lock); + skb = skb_peek(head); + if (skb) { + struct scm_fp_list *fpl = UNIXCB(skb).fp; + + if (fpl->count < SCM_MAX_FD) { + __skb_unlink(skb, head); + spin_unlock_irq(&head->lock); + fpl->fp[fpl->count] = get_file(file); + unix_inflight(fpl->user, fpl->fp[fpl->count]); + fpl->count++; + spin_lock_irq(&head->lock); + __skb_queue_head(head, skb); + } else { + skb = NULL; + } + } + spin_unlock_irq(&head->lock); + + if (skb) { + fput(file); + return 0; + } + + return __io_sqe_files_scm(ctx, 1, index); +#else + return 0; +#endif +} + +static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_files_update up; + __s32 __user *fds; + int fd, i, err; + __u32 done; + + if (!ctx->user_files) + return -ENXIO; + if (!nr_args) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (check_add_overflow(up.offset, nr_args, &done)) + return -EOVERFLOW; + if (done > ctx->nr_user_files) + return -EINVAL; + + done = 0; + fds = (__s32 __user *) up.fds; + while (nr_args) { + err = 0; + if (copy_from_user(&fd, &fds[done], sizeof(fd))) { + err = -EFAULT; + break; + } + i = array_index_nospec(up.offset, ctx->nr_user_files); + if (ctx->user_files[i]) { + io_sqe_file_unregister(ctx, i); + ctx->user_files[i] = NULL; + } + if (fd != -1) { + struct file *file; + + file = fget(fd); + if (!file) { + err = -EBADF; + break; + } + /* + * Don't allow io_uring instances to be registered. If + * UNIX isn't enabled, then this causes a reference + * cycle and this instance can never get freed. If UNIX + * is enabled we'll handle it just fine, but there's + * still no point in allowing a ring fd as it doesn't + * support regular read/write anyway. + */ + if (file->f_op == &io_uring_fops) { + fput(file); + err = -EBADF; + break; + } + ctx->user_files[i] = file; + err = io_sqe_file_register(ctx, file, i); + if (err) + break; + } + nr_args--; + done++; + up.offset++; + } + + return done ? done : err; +} + static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { @@ -4031,6 +4203,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_sqe_files_unregister(ctx); break; + case IORING_REGISTER_FILES_UPDATE: + ret = io_sqe_files_update(ctx, arg, nr_args); + break; case IORING_REGISTER_EVENTFD: ret = -EINVAL; if (nr_args != 1) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ea57526a5b89..4f532d9c0554 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -150,5 +150,11 @@ struct io_uring_params { #define IORING_UNREGISTER_FILES 3 #define IORING_REGISTER_EVENTFD 4 #define IORING_UNREGISTER_EVENTFD 5 +#define IORING_REGISTER_FILES_UPDATE 6 + +struct io_uring_files_update { + __u32 offset; + __s32 *fds; +}; #endif From 33a107f0a1b8df0ad925e39d8afc97bb78e0cec1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Oct 2019 12:10:03 -0600 Subject: [PATCH 04/64] io_uring: allow application controlled CQ ring size We currently size the CQ ring as twice the SQ ring, to allow some flexibility in not overflowing the CQ ring. This is done because the SQE life time is different than that of the IO request itself, the SQE is consumed as soon as the kernel has seen the entry. Certain application don't need a huge SQ ring size, since they just submit IO in batches. But they may have a lot of requests pending, and hence need a big CQ ring to hold them all. By allowing the application to control the CQ ring size multiplier, we can cater to those applications more efficiently. If an application wants to define its own CQ ring size, it must set IORING_SETUP_CQSIZE in the setup flags, and fill out io_uring_params->cq_entries. The value must be a power of two. Signed-off-by: Jens Axboe --- fs/io_uring.c | 20 +++++++++++++++++--- include/uapi/linux/io_uring.h | 1 + 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 77774abb1074..bc93bdfe40e2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -76,6 +76,7 @@ #include "internal.h" #define IORING_MAX_ENTRIES 32768 +#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_MAX_FIXED_FILES 1024 struct io_uring { @@ -4049,10 +4050,23 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) * Use twice as many entries for the CQ ring. It's possible for the * application to drive a higher depth than the size of the SQ ring, * since the sqes are only used at submission time. This allows for - * some flexibility in overcommitting a bit. + * some flexibility in overcommitting a bit. If the application has + * set IORING_SETUP_CQSIZE, it will have passed in the desired number + * of CQ ring entries manually. */ p->sq_entries = roundup_pow_of_two(entries); - p->cq_entries = 2 * p->sq_entries; + if (p->flags & IORING_SETUP_CQSIZE) { + /* + * If IORING_SETUP_CQSIZE is set, we do the same roundup + * to a power-of-two, if it isn't already. We do NOT impose + * any cq vs sq ring sizing. + */ + if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES) + return -EINVAL; + p->cq_entries = roundup_pow_of_two(p->cq_entries); + } else { + p->cq_entries = 2 * p->sq_entries; + } user = get_uid(current_user()); account_mem = !capable(CAP_IPC_LOCK); @@ -4137,7 +4151,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) } if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF)) + IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE)) return -EINVAL; ret = io_uring_create(entries, &p); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4f532d9c0554..e0137ea6ad79 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -50,6 +50,7 @@ struct io_uring_sqe { #define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ #define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ #define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ +#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */ #define IORING_OP_NOP 0 #define IORING_OP_READV 1 From ba5290ccb6b57fc5e274ae46d051fba1f0ece262 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Wed, 9 Oct 2019 09:19:59 +0800 Subject: [PATCH 05/64] io_uring: replace s->needs_lock with s->in_async There is no function change, just to clean up the code, use s->in_async to make the code know where it is. Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index bc93bdfe40e2..6bbca3d58941 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -271,7 +271,7 @@ struct sqe_submit { unsigned short index; u32 sequence; bool has_user; - bool needs_lock; + bool in_async; bool needs_fixed_file; }; @@ -1474,13 +1474,9 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, nxt, s->needs_lock); + kiocb_done(kiocb, ret2, nxt, s->in_async); } else { - /* - * If ->needs_lock is true, we're already in async - * context. - */ - if (!s->needs_lock) + if (!s->in_async) io_async_list_note(READ, req, iov_count); ret = -EAGAIN; } @@ -1518,8 +1514,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, ret = -EAGAIN; if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) { - /* If ->needs_lock is true, we're already in async context. */ - if (!s->needs_lock) + if (!s->in_async) io_async_list_note(WRITE, req, iov_count); goto out_free; } @@ -1548,13 +1543,9 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, else ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, nxt, s->needs_lock); + kiocb_done(kiocb, ret2, nxt, s->in_async); } else { - /* - * If ->needs_lock is true, we're already in async - * context. - */ - if (!s->needs_lock) + if (!s->in_async) io_async_list_note(WRITE, req, iov_count); ret = -EAGAIN; } @@ -2152,10 +2143,10 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return -EAGAIN; /* workqueue context doesn't hold uring_lock, grab it now */ - if (s->needs_lock) + if (s->in_async) mutex_lock(&ctx->uring_lock); io_iopoll_req_issued(req); - if (s->needs_lock) + if (s->in_async) mutex_unlock(&ctx->uring_lock); } @@ -2220,7 +2211,7 @@ static void io_sq_wq_submit_work(struct work_struct *work) if (!ret) { s->has_user = cur_mm != NULL; - s->needs_lock = true; + s->in_async = true; do { ret = __io_submit_sqe(ctx, req, s, &nxt, false); /* @@ -2696,7 +2687,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, -EFAULT); } else { s.has_user = has_user; - s.needs_lock = true; + s.in_async = true; s.needs_fixed_file = true; io_submit_sqe(ctx, &s, statep, &link); submitted++; @@ -2883,7 +2874,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) out: s.has_user = true; - s.needs_lock = false; + s.in_async = false; s.needs_fixed_file = false; submit++; io_submit_sqe(ctx, &s, statep, &link); From a41525ab2e75987e809926352ebc6f1397da900e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Oct 2019 16:48:15 -0600 Subject: [PATCH 06/64] io_uring: add support for absolute timeouts This is a pretty trivial addition on top of the relative timeouts we have now, but it's handy for ensuring tighter timing for those that are building scheduling primitives on top of io_uring. Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++++++----- include/uapi/linux/io_uring.h | 5 +++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6bbca3d58941..2fc6809bc3a9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1978,13 +1978,17 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned count; struct io_ring_ctx *ctx = req->ctx; struct list_head *entry; + enum hrtimer_mode mode; struct timespec64 ts; unsigned span = 0; + unsigned flags; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags || - sqe->len != 1) + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len != 1) + return -EINVAL; + flags = READ_ONCE(sqe->timeout_flags); + if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) @@ -2042,10 +2046,13 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) list_add(&req->list, entry); spin_unlock_irq(&ctx->completion_lock); - hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + if (flags & IORING_TIMEOUT_ABS) + mode = HRTIMER_MODE_ABS; + else + mode = HRTIMER_MODE_REL; + hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode); req->timeout.timer.function = io_timeout_fn; - hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), - HRTIMER_MODE_REL); + hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode); return 0; } diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e0137ea6ad79..b402dfee5e15 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -70,6 +70,11 @@ struct io_uring_sqe { */ #define IORING_FSYNC_DATASYNC (1U << 0) +/* + * sqe->timeout_flags + */ +#define IORING_TIMEOUT_ABS (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ From 11365043e5271fea4c92189a976833da477a3a44 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Oct 2019 09:08:32 -0600 Subject: [PATCH 07/64] io_uring: add support for canceling timeout requests We might have cases where the need for a specific timeout is gone, add support for canceling an existing timeout operation. This works like the POLL_REMOVE command, where the application passes in the user_data of the timeout it wishes to cancel in the sqe->addr field. Signed-off-by: Jens Axboe --- fs/io_uring.c | 109 ++++++++++++++++++++++++++++------ include/uapi/linux/io_uring.h | 1 + 2 files changed, 92 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2fc6809bc3a9..e5564cd91e9c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1944,8 +1944,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { struct io_ring_ctx *ctx; - struct io_kiocb *req, *prev; + struct io_kiocb *req; unsigned long flags; + bool comp; req = container_of(timer, struct io_kiocb, timeout.timer); ctx = req->ctx; @@ -1953,24 +1954,92 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) spin_lock_irqsave(&ctx->completion_lock, flags); /* - * Adjust the reqs sequence before the current one because it - * will consume a slot in the cq_ring and the the cq_tail pointer - * will be increased, otherwise other timeout reqs may return in - * advance without waiting for enough wait_nr. + * We could be racing with timeout deletion. If the list is empty, + * then timeout lookup already found it and will be handling it. */ - prev = req; - list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list) - prev->sequence++; - list_del(&req->list); + comp = !list_empty(&req->list); + if (comp) { + struct io_kiocb *prev; - io_cqring_fill_event(ctx, req->user_data, -ETIME); - io_commit_cqring(ctx); + /* + * Adjust the reqs sequence before the current one because it + * will consume a slot in the cq_ring and the the cq_tail + * pointer will be increased, otherwise other timeout reqs may + * return in advance without waiting for enough wait_nr. + */ + prev = req; + list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list) + prev->sequence++; + + list_del_init(&req->list); + io_cqring_fill_event(ctx, req->user_data, -ETIME); + io_commit_cqring(ctx); + } spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (comp) { + io_cqring_ev_posted(ctx); + io_put_req(req, NULL); + } + return HRTIMER_NORESTART; +} + +/* + * Remove or update an existing timeout command + */ +static int io_timeout_remove(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *treq; + int ret = -ENOENT; + __u64 user_data; + unsigned flags; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) + return -EINVAL; + flags = READ_ONCE(sqe->timeout_flags); + if (flags) + return -EINVAL; + + user_data = READ_ONCE(sqe->addr); + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry(treq, &ctx->timeout_list, list) { + if (user_data == treq->user_data) { + list_del_init(&treq->list); + ret = 0; + break; + } + } + + /* didn't find timeout */ + if (ret) { +fill_ev: + io_cqring_fill_event(ctx, req->user_data, ret); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + io_cqring_ev_posted(ctx); + io_put_req(req, NULL); + return 0; + } + + ret = hrtimer_try_to_cancel(&treq->timeout.timer); + if (ret == -1) { + ret = -EBUSY; + goto fill_ev; + } + + io_cqring_fill_event(ctx, req->user_data, 0); + io_cqring_fill_event(ctx, treq->user_data, -ECANCELED); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); + io_put_req(treq, NULL); io_put_req(req, NULL); - return HRTIMER_NORESTART; + return 0; } static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1994,6 +2063,13 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; + if (flags & IORING_TIMEOUT_ABS) + mode = HRTIMER_MODE_ABS; + else + mode = HRTIMER_MODE_REL; + + hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode); + /* * sqe->off holds how many events that need to occur for this * timeout event to be satisfied. @@ -2045,12 +2121,6 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->sequence -= span; list_add(&req->list, entry); spin_unlock_irq(&ctx->completion_lock); - - if (flags & IORING_TIMEOUT_ABS) - mode = HRTIMER_MODE_ABS; - else - mode = HRTIMER_MODE_REL; - hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode); req->timeout.timer.function = io_timeout_fn; hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode); return 0; @@ -2137,6 +2207,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_TIMEOUT: ret = io_timeout(req, s->sqe); break; + case IORING_OP_TIMEOUT_REMOVE: + ret = io_timeout_remove(req, s->sqe); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b402dfee5e15..6dc5ced1c37a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -64,6 +64,7 @@ struct io_uring_sqe { #define IORING_OP_SENDMSG 9 #define IORING_OP_RECVMSG 10 #define IORING_OP_TIMEOUT 11 +#define IORING_OP_TIMEOUT_REMOVE 12 /* * sqe->fsync_flags From c826bd7a743f275e2b68c16d595534063b400deb Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Tue, 15 Oct 2019 19:02:01 +0200 Subject: [PATCH 08/64] io_uring: add set of tracing events To trace io_uring activity one can get an information from workqueue and io trace events, but looks like some parts could be hard to identify via this approach. Making what happens inside io_uring more transparent is important to be able to reason about many aspects of it, hence introduce the set of tracing events. All such events could be roughly divided into two categories: * those, that are helping to understand correctness (from both kernel and an application point of view). E.g. a ring creation, file registration, or waiting for available CQE. Proposed approach is to get a pointer to an original structure of interest (ring context, or request), and then find relevant events. io_uring_queue_async_work also exposes a pointer to work_struct, to be able to track down corresponding workqueue events. * those, that provide performance related information. Mostly it's about events that change the flow of requests, e.g. whether an async work was queued, or delayed due to some dependencies. Another important case is how io_uring optimizations (e.g. registered files) are utilized. Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++ include/Kbuild | 1 + include/trace/events/io_uring.h | 349 ++++++++++++++++++++++++++++++++ 3 files changed, 367 insertions(+) create mode 100644 include/trace/events/io_uring.h diff --git a/fs/io_uring.c b/fs/io_uring.c index e5564cd91e9c..f83465fc1ed1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -71,6 +71,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include #include "internal.h" @@ -491,6 +494,7 @@ static inline void io_queue_async_work(struct io_ring_ctx *ctx, } } + trace_io_uring_queue_async_work(ctx, rw, req, &req->work, req->flags); queue_work(ctx->sqo_wq[rw], &req->work); } @@ -710,6 +714,7 @@ static void io_fail_links(struct io_kiocb *req) link = list_first_entry(&req->link_list, struct io_kiocb, list); list_del(&link->list); + trace_io_uring_fail_link(req, link); io_cqring_add_event(req->ctx, link->user_data, -ECANCELED); __io_free_req(link); } @@ -2149,6 +2154,7 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, req->submit.sqe = sqe_copy; INIT_WORK(&req->work, io_sq_wq_submit_work); + trace_io_uring_defer(ctx, req, false); list_add_tail(&req->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; @@ -2410,6 +2416,8 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req) ret = false; } spin_unlock(&list->lock); + + trace_io_uring_add_to_prev(req, ret); return ret; } @@ -2458,6 +2466,7 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, } else { if (s->needs_fixed_file) return -EBADF; + trace_io_uring_file_get(ctx, fd); req->file = io_file_get(state, fd); if (unlikely(!req->file)) return -EBADF; @@ -2567,6 +2576,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, /* Insert shadow req to defer_list, blocking next IOs */ spin_lock_irq(&ctx->completion_lock); + trace_io_uring_defer(ctx, shadow, true); list_add_tail(&shadow->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); @@ -2626,6 +2636,7 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, s->sqe = sqe_copy; memcpy(&req->submit, s, sizeof(*s)); + trace_io_uring_link(ctx, req, prev); list_add_tail(&req->list, &prev->link_list); } else if (s->sqe->flags & IOSQE_IO_LINK) { req->flags |= REQ_F_LINK; @@ -2769,6 +2780,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, s.has_user = has_user; s.in_async = true; s.needs_fixed_file = true; + trace_io_uring_submit_sqe(ctx, true, true); io_submit_sqe(ctx, &s, statep, &link); submitted++; } @@ -2957,6 +2969,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) s.in_async = false; s.needs_fixed_file = false; submit++; + trace_io_uring_submit_sqe(ctx, true, false); io_submit_sqe(ctx, &s, statep, &link); } @@ -3039,6 +3052,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, ret = 0; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + trace_io_uring_cqring_wait(ctx, min_events); do { prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); @@ -4197,6 +4211,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; p->features = IORING_FEAT_SINGLE_MMAP; + trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: io_ring_ctx_wait_and_kill(ctx); @@ -4334,6 +4349,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); mutex_unlock(&ctx->uring_lock); + trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, + ctx->cq_ev_fd != NULL, ret); out_fput: fdput(f); return ret; diff --git a/include/Kbuild b/include/Kbuild index ffba79483cc5..61b66725d259 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -1028,6 +1028,7 @@ header-test- += trace/events/fsi_master_gpio.h header-test- += trace/events/huge_memory.h header-test- += trace/events/ib_mad.h header-test- += trace/events/ib_umad.h +header-test- += trace/events/io_uring.h header-test- += trace/events/iscsi.h header-test- += trace/events/jbd2.h header-test- += trace/events/kvm.h diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h new file mode 100644 index 000000000000..c5a905fbf1da --- /dev/null +++ b/include/trace/events/io_uring.h @@ -0,0 +1,349 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM io_uring + +#if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_IO_URING_H + +#include + +/** + * io_uring_create - called after a new io_uring context was prepared + * + * @fd: corresponding file descriptor + * @ctx: pointer to a ring context structure + * @sq_entries: actual SQ size + * @cq_entries: actual CQ size + * @flags: SQ ring flags, provided to io_uring_setup(2) + * + * Allows to trace io_uring creation and provide pointer to a context, that can + * be used later to find correlated events. + */ +TRACE_EVENT(io_uring_create, + + TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags), + + TP_ARGS(fd, ctx, sq_entries, cq_entries, flags), + + TP_STRUCT__entry ( + __field( int, fd ) + __field( void *, ctx ) + __field( u32, sq_entries ) + __field( u32, cq_entries ) + __field( u32, flags ) + ), + + TP_fast_assign( + __entry->fd = fd; + __entry->ctx = ctx; + __entry->sq_entries = sq_entries; + __entry->cq_entries = cq_entries; + __entry->flags = flags; + ), + + TP_printk("ring %p, fd %d sq size %d, cq size %d, flags %d", + __entry->ctx, __entry->fd, __entry->sq_entries, + __entry->cq_entries, __entry->flags) +); + +/** + * io_uring_register - called after a buffer/file/eventfd was succesfully + * registered for a ring + * + * @ctx: pointer to a ring context structure + * @opcode: describes which operation to perform + * @nr_user_files: number of registered files + * @nr_user_bufs: number of registered buffers + * @cq_ev_fd: whether eventfs registered or not + * @ret: return code + * + * Allows to trace fixed files/buffers/eventfds, that could be registered to + * avoid an overhead of getting references to them for every operation. This + * event, together with io_uring_file_get, can provide a full picture of how + * much overhead one can reduce via fixing. + */ +TRACE_EVENT(io_uring_register, + + TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files, + unsigned nr_bufs, bool eventfd, long ret), + + TP_ARGS(ctx, opcode, nr_files, nr_bufs, eventfd, ret), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( unsigned, opcode ) + __field( unsigned, nr_files ) + __field( unsigned, nr_bufs ) + __field( bool, eventfd ) + __field( long, ret ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->nr_files = nr_files; + __entry->nr_bufs = nr_bufs; + __entry->eventfd = eventfd; + __entry->ret = ret; + ), + + TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, " + "eventfd %d, ret %ld", + __entry->ctx, __entry->opcode, __entry->nr_files, + __entry->nr_bufs, __entry->eventfd, __entry->ret) +); + +/** + * io_uring_file_get - called before getting references to an SQE file + * + * @ctx: pointer to a ring context structure + * @fd: SQE file descriptor + * + * Allows to trace out how often an SQE file reference is obtained, which can + * help figuring out if it makes sense to use fixed files, or check that fixed + * files are used correctly. + */ +TRACE_EVENT(io_uring_file_get, + + TP_PROTO(void *ctx, int fd), + + TP_ARGS(ctx, fd), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( int, fd ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->fd = fd; + ), + + TP_printk("ring %p, fd %d", __entry->ctx, __entry->fd) +); + +/** + * io_uring_queue_async_work - called before submitting a new async work + * + * @ctx: pointer to a ring context structure + * @rw: type of workqueue, normal or buffered writes + * @req: pointer to a submitted request + * @work: pointer to a submitted work_struct + * + * Allows to trace asynchronous work submission. + */ +TRACE_EVENT(io_uring_queue_async_work, + + TP_PROTO(void *ctx, int rw, void * req, struct work_struct *work, + unsigned int flags), + + TP_ARGS(ctx, rw, req, work, flags), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( int, rw ) + __field( void *, req ) + __field( struct work_struct *, work ) + __field( unsigned int, flags ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->rw = rw; + __entry->req = req; + __entry->work = work; + __entry->flags = flags; + ), + + TP_printk("ring %p, request %p, flags %d, %s queue, work %p", + __entry->ctx, __entry->req, __entry->flags, + __entry->rw ? "buffered" : "normal", __entry->work) +); + +/** + * io_uring_defer_list - called before the io_uring work added into defer_list + * + * @ctx: pointer to a ring context structure + * @req: pointer to a deferred request + * @shadow: whether request is shadow or not + * + * Allows to track deferred requests, to get an insight about what requests are + * not started immediately. + */ +TRACE_EVENT(io_uring_defer, + + TP_PROTO(void *ctx, void *req, bool shadow), + + TP_ARGS(ctx, req, shadow), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( void *, req ) + __field( bool, shadow ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->req = req; + __entry->shadow = shadow; + ), + + TP_printk("ring %p, request %p%s", __entry->ctx, __entry->req, + __entry->shadow ? ", shadow": "") +); + +/** + * io_uring_link - called before the io_uring request added into link_list of + * another request + * + * @ctx: pointer to a ring context structure + * @req: pointer to a linked request + * @target_req: pointer to a previous request, that would contain @req + * + * Allows to track linked requests, to understand dependencies between requests + * and how does it influence their execution flow. + */ +TRACE_EVENT(io_uring_link, + + TP_PROTO(void *ctx, void *req, void *target_req), + + TP_ARGS(ctx, req, target_req), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( void *, req ) + __field( void *, target_req ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->req = req; + __entry->target_req = target_req; + ), + + TP_printk("ring %p, request %p linked after %p", + __entry->ctx, __entry->req, __entry->target_req) +); + +/** + * io_uring_add_to_prev - called after a request was added into a previously + * submitted work + * + * @req: pointer to a request, added to a previous + * @ret: whether or not it was completed successfully + * + * Allows to track merged work, to figure out how often requests are piggy + * backed into other ones, changing the execution flow. + */ +TRACE_EVENT(io_uring_add_to_prev, + + TP_PROTO(void *req, bool ret), + + TP_ARGS(req, ret), + + TP_STRUCT__entry ( + __field( void *, req ) + __field( bool, ret ) + ), + + TP_fast_assign( + __entry->req = req; + __entry->ret = ret; + ), + + TP_printk("request %p, ret %d", __entry->req, __entry->ret) +); + +/** + * io_uring_cqring_wait - called before start waiting for an available CQE + * + * @ctx: pointer to a ring context structure + * @min_events: minimal number of events to wait for + * + * Allows to track waiting for CQE, so that we can e.g. troubleshoot + * situations, when an application wants to wait for an event, that never + * comes. + */ +TRACE_EVENT(io_uring_cqring_wait, + + TP_PROTO(void *ctx, int min_events), + + TP_ARGS(ctx, min_events), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( int, min_events ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->min_events = min_events; + ), + + TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events) +); + +/** + * io_uring_fail_link - called before failing a linked request + * + * @req: request, which links were cancelled + * @link: cancelled link + * + * Allows to track linked requests cancellation, to see not only that some work + * was cancelled, but also which request was the reason. + */ +TRACE_EVENT(io_uring_fail_link, + + TP_PROTO(void *req, void *link), + + TP_ARGS(req, link), + + TP_STRUCT__entry ( + __field( void *, req ) + __field( void *, link ) + ), + + TP_fast_assign( + __entry->req = req; + __entry->link = link; + ), + + TP_printk("request %p, link %p", __entry->req, __entry->link) +); + +/** + * io_uring_submit_sqe - called before submitting one SQE + * + * @ctx: pointer to a ring context structure + * @force_nonblock: whether a context blocking or not + * @sq_thread: true if sq_thread has submitted this SQE + * + * Allows to track SQE submitting, to understand what was the source of it, SQ + * thread or io_uring_enter call. + */ +TRACE_EVENT(io_uring_submit_sqe, + + TP_PROTO(void *ctx, bool force_nonblock, bool sq_thread), + + TP_ARGS(ctx, force_nonblock, sq_thread), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( bool, force_nonblock ) + __field( bool, sq_thread ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->force_nonblock = force_nonblock; + __entry->sq_thread = sq_thread; + ), + + TP_printk("ring %p, non block %d, sq_thread %d", + __entry->ctx, __entry->force_nonblock, __entry->sq_thread) +); + +#endif /* _TRACE_IO_URING_H */ + +/* This part must be outside protection */ +#include From fa4562280889ad372dfb1413833a8b8675721b17 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 27 Oct 2019 18:52:20 +0300 Subject: [PATCH 09/64] io_uring: remove index from sqe_submit submit->index is used only for inbound check in submission path (i.e. head < ctx->sq_entries). However, it always will be true, as 1. it's already validated by io_get_sqring() 2. ctx->sq_entries can't be changedd in between, because of held ctx->uring_lock and ctx->refs. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f83465fc1ed1..545efd89a1f9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -271,7 +271,6 @@ struct io_ring_ctx { struct sqe_submit { const struct io_uring_sqe *sqe; - unsigned short index; u32 sequence; bool has_user; bool in_async; @@ -2168,9 +2167,6 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, req->user_data = READ_ONCE(s->sqe->user_data); - if (unlikely(s->index >= ctx->sq_entries)) - return -EINVAL; - opcode = READ_ONCE(s->sqe->opcode); switch (opcode) { case IORING_OP_NOP: @@ -2716,7 +2712,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) head = READ_ONCE(sq_array[head & ctx->sq_mask]); if (head < ctx->sq_entries) { - s->index = head; s->sqe = &ctx->sq_sqes[head]; s->sequence = ctx->cached_sq_head; ctx->cached_sq_head++; From 95a1b3ff9a3e4ea2f26c4e802067d58831f415db Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 27 Oct 2019 23:15:41 +0300 Subject: [PATCH 10/64] io_uring: Fix mm_fault with READ/WRITE_FIXED Commit fb5ccc98782f ("io_uring: Fix broken links with offloading") introduced a potential performance regression with unconditionally taking mm even for READ/WRITE_FIXED operations. Return the logic handling it back. mm-faulted requests will go through the generic submission path, so honoring links and drains, but will fail further on req->has_user check. Fixes: fb5ccc98782f ("io_uring: Fix broken links with offloading") Cc: stable@vger.kernel.org # v5.4 Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 41 +++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 545efd89a1f9..f9eff8f62ddb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2726,13 +2726,14 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, - bool has_user, bool mm_fault) + struct mm_struct **mm) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; struct io_kiocb *shadow_req = NULL; bool prev_was_link = false; int i, submitted = 0; + bool mm_fault = false; if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, ctx, nr); @@ -2745,6 +2746,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (!io_get_sqring(ctx, &s)) break; + if (io_sqe_needs_user(s.sqe) && !*mm) { + mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); + if (!mm_fault) { + use_mm(ctx->sqo_mm); + *mm = ctx->sqo_mm; + } + } + /* * If previous wasn't linked and we have a linked command, * that's the end of the chain. Submit the previous link. @@ -2768,17 +2777,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } out: - if (unlikely(mm_fault)) { - io_cqring_add_event(ctx, s.sqe->user_data, - -EFAULT); - } else { - s.has_user = has_user; - s.in_async = true; - s.needs_fixed_file = true; - trace_io_uring_submit_sqe(ctx, true, true); - io_submit_sqe(ctx, &s, statep, &link); - submitted++; - } + s.has_user = *mm != NULL; + s.in_async = true; + s.needs_fixed_file = true; + trace_io_uring_submit_sqe(ctx, true, true); + io_submit_sqe(ctx, &s, statep, &link); + submitted++; } if (link) @@ -2805,7 +2809,6 @@ static int io_sq_thread(void *data) timeout = inflight = 0; while (!kthread_should_park()) { - bool mm_fault = false; unsigned int to_submit; if (inflight) { @@ -2890,18 +2893,8 @@ static int io_sq_thread(void *data) ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } - /* Unless all new commands are FIXED regions, grab mm */ - if (!cur_mm) { - mm_fault = !mmget_not_zero(ctx->sqo_mm); - if (!mm_fault) { - use_mm(ctx->sqo_mm); - cur_mm = ctx->sqo_mm; - } - } - to_submit = min(to_submit, ctx->sq_entries); - inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL, - mm_fault); + inflight += io_submit_sqes(ctx, to_submit, &cur_mm); /* Commit SQ ring head once we've consumed all SQEs */ io_commit_sqring(ctx); From 771b53d033e8663abdf59704806aa856b236dcdb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 22 Oct 2019 10:25:58 -0600 Subject: [PATCH 11/64] io-wq: small threadpool implementation for io_uring This adds support for io-wq, a smaller and specialized thread pool implementation. This is meant to replace workqueues for io_uring. Among the reasons for this addition are: - We can assign memory context smarter and more persistently if we manage the life time of threads. - We can drop various work-arounds we have in io_uring, like the async_list. - We can implement hashed work insertion, to manage concurrency of buffered writes without needing a) an extra workqueue, or b) needlessly making the concurrency of said workqueue very low which hurts performance of multiple buffered file writers. - We can implement cancel through signals, for cancelling interruptible work like read/write (or send/recv) to/from sockets. - We need the above cancel for being able to assign and use file tables from a process. - We can implement a more thorough cancel operation in general. - We need it to move towards a syslet/threadlet model for even faster async execution. For that we need to take ownership of the used threads. This list is just off the top of my head. Performance should be the same, or better, at least that's what I've seen in my testing. io-wq supports basic NUMA functionality, setting up a pool per node. io-wq hooks up to the scheduler schedule in/out just like workqueue and uses that to drive the need for more/less workers. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Jens Axboe --- fs/Kconfig | 3 + fs/Makefile | 1 + fs/io-wq.c | 825 ++++++++++++++++++++++++++++++++++++++++++ fs/io-wq.h | 55 +++ include/linux/sched.h | 1 + kernel/sched/core.c | 16 +- 6 files changed, 897 insertions(+), 4 deletions(-) create mode 100644 fs/io-wq.c create mode 100644 fs/io-wq.h diff --git a/fs/Kconfig b/fs/Kconfig index 2501e6f1f965..7b623e9fc1b0 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -322,4 +322,7 @@ source "fs/nls/Kconfig" source "fs/dlm/Kconfig" source "fs/unicode/Kconfig" +config IO_WQ + bool + endmenu diff --git a/fs/Makefile b/fs/Makefile index 14231b4cf383..1148c555c4d3 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_IO_URING) += io_uring.o +obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ diff --git a/fs/io-wq.c b/fs/io-wq.c new file mode 100644 index 000000000000..37863879e987 --- /dev/null +++ b/fs/io-wq.c @@ -0,0 +1,825 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Basic worker thread pool for io_uring + * + * Copyright (C) 2019 Jens Axboe + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "io-wq.h" + +#define WORKER_IDLE_TIMEOUT (5 * HZ) + +enum { + IO_WORKER_F_UP = 1, /* up and active */ + IO_WORKER_F_RUNNING = 2, /* account as running */ + IO_WORKER_F_FREE = 4, /* worker on free list */ + IO_WORKER_F_EXITING = 8, /* worker exiting */ + IO_WORKER_F_FIXED = 16, /* static idle worker */ +}; + +enum { + IO_WQ_BIT_EXIT = 0, /* wq exiting */ + IO_WQ_BIT_CANCEL = 1, /* cancel work on list */ +}; + +enum { + IO_WQE_FLAG_STALLED = 1, /* stalled on hash */ +}; + +/* + * One for each thread in a wqe pool + */ +struct io_worker { + refcount_t ref; + unsigned flags; + struct hlist_nulls_node nulls_node; + struct task_struct *task; + wait_queue_head_t wait; + struct io_wqe *wqe; + struct io_wq_work *cur_work; + + struct rcu_head rcu; + struct mm_struct *mm; +}; + +struct io_wq_nulls_list { + struct hlist_nulls_head head; + unsigned long nulls; +}; + +#if BITS_PER_LONG == 64 +#define IO_WQ_HASH_ORDER 6 +#else +#define IO_WQ_HASH_ORDER 5 +#endif + +/* + * Per-node worker thread pool + */ +struct io_wqe { + struct { + spinlock_t lock; + struct list_head work_list; + unsigned long hash_map; + unsigned flags; + } ____cacheline_aligned_in_smp; + + int node; + unsigned nr_workers; + unsigned max_workers; + atomic_t nr_running; + + struct io_wq_nulls_list free_list; + struct io_wq_nulls_list busy_list; + + struct io_wq *wq; +}; + +/* + * Per io_wq state + */ +struct io_wq { + struct io_wqe **wqes; + unsigned long state; + unsigned nr_wqes; + + struct task_struct *manager; + struct mm_struct *mm; + refcount_t refs; + struct completion done; +}; + +static void io_wq_free_worker(struct rcu_head *head) +{ + struct io_worker *worker = container_of(head, struct io_worker, rcu); + + kfree(worker); +} + +static bool io_worker_get(struct io_worker *worker) +{ + return refcount_inc_not_zero(&worker->ref); +} + +static void io_worker_release(struct io_worker *worker) +{ + if (refcount_dec_and_test(&worker->ref)) + wake_up_process(worker->task); +} + +/* + * Note: drops the wqe->lock if returning true! The caller must re-acquire + * the lock in that case. Some callers need to restart handling if this + * happens, so we can't just re-acquire the lock on behalf of the caller. + */ +static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) +{ + /* + * If we have an active mm, we need to drop the wq lock before unusing + * it. If we do, return true and let the caller retry the idle loop. + */ + if (worker->mm) { + __acquire(&wqe->lock); + spin_unlock_irq(&wqe->lock); + __set_current_state(TASK_RUNNING); + set_fs(KERNEL_DS); + unuse_mm(worker->mm); + mmput(worker->mm); + worker->mm = NULL; + return true; + } + + return false; +} + +static void io_worker_exit(struct io_worker *worker) +{ + struct io_wqe *wqe = worker->wqe; + bool all_done = false; + + /* + * If we're not at zero, someone else is holding a brief reference + * to the worker. Wait for that to go away. + */ + set_current_state(TASK_INTERRUPTIBLE); + if (!refcount_dec_and_test(&worker->ref)) + schedule(); + __set_current_state(TASK_RUNNING); + + preempt_disable(); + current->flags &= ~PF_IO_WORKER; + if (worker->flags & IO_WORKER_F_RUNNING) + atomic_dec(&wqe->nr_running); + worker->flags = 0; + preempt_enable(); + + spin_lock_irq(&wqe->lock); + hlist_nulls_del_rcu(&worker->nulls_node); + if (__io_worker_unuse(wqe, worker)) { + __release(&wqe->lock); + spin_lock_irq(&wqe->lock); + } + wqe->nr_workers--; + all_done = !wqe->nr_workers; + spin_unlock_irq(&wqe->lock); + + /* all workers gone, wq exit can proceed */ + if (all_done && refcount_dec_and_test(&wqe->wq->refs)) + complete(&wqe->wq->done); + + call_rcu(&worker->rcu, io_wq_free_worker); +} + +static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) +{ + allow_kernel_signal(SIGINT); + + current->flags |= PF_IO_WORKER; + + worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); + atomic_inc(&wqe->nr_running); +} + +/* + * Worker will start processing some work. Move it to the busy list, if + * it's currently on the freelist + */ +static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, + struct io_wq_work *work) + __must_hold(wqe->lock) +{ + if (worker->flags & IO_WORKER_F_FREE) { + worker->flags &= ~IO_WORKER_F_FREE; + hlist_nulls_del_init_rcu(&worker->nulls_node); + hlist_nulls_add_head_rcu(&worker->nulls_node, + &wqe->busy_list.head); + } + worker->cur_work = work; +} + +/* + * No work, worker going to sleep. Move to freelist, and unuse mm if we + * have one attached. Dropping the mm may potentially sleep, so we drop + * the lock in that case and return success. Since the caller has to + * retry the loop in that case (we changed task state), we don't regrab + * the lock if we return success. + */ +static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) + __must_hold(wqe->lock) +{ + if (!(worker->flags & IO_WORKER_F_FREE)) { + worker->flags |= IO_WORKER_F_FREE; + hlist_nulls_del_init_rcu(&worker->nulls_node); + hlist_nulls_add_head_rcu(&worker->nulls_node, + &wqe->free_list.head); + } + + return __io_worker_unuse(wqe, worker); +} + +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) + __must_hold(wqe->lock) +{ + struct io_wq_work *work; + + list_for_each_entry(work, &wqe->work_list, list) { + /* not hashed, can run anytime */ + if (!(work->flags & IO_WQ_WORK_HASHED)) { + list_del(&work->list); + return work; + } + + /* hashed, can run if not already running */ + *hash = work->flags >> IO_WQ_HASH_SHIFT; + if (!(wqe->hash_map & BIT_ULL(*hash))) { + wqe->hash_map |= BIT_ULL(*hash); + list_del(&work->list); + return work; + } + } + + return NULL; +} + +static void io_worker_handle_work(struct io_worker *worker) + __releases(wqe->lock) +{ + struct io_wq_work *work, *old_work; + struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = wqe->wq; + + do { + unsigned hash = -1U; + + /* + * Signals are either sent to cancel specific work, or to just + * cancel all work items. For the former, ->cur_work must + * match. ->cur_work is NULL at this point, since we haven't + * assigned any work, so it's safe to flush signals for that + * case. For the latter case of cancelling all work, the caller + * wil have set IO_WQ_BIT_CANCEL. + */ + if (signal_pending(current)) + flush_signals(current); + + /* + * If we got some work, mark us as busy. If we didn't, but + * the list isn't empty, it means we stalled on hashed work. + * Mark us stalled so we don't keep looking for work when we + * can't make progress, any work completion or insertion will + * clear the stalled flag. + */ + work = io_get_next_work(wqe, &hash); + if (work) + __io_worker_busy(wqe, worker, work); + else if (!list_empty(&wqe->work_list)) + wqe->flags |= IO_WQE_FLAG_STALLED; + + spin_unlock_irq(&wqe->lock); + if (!work) + break; +next: + if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm && + wq->mm && mmget_not_zero(wq->mm)) { + use_mm(wq->mm); + set_fs(USER_DS); + worker->mm = wq->mm; + } + if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) + work->flags |= IO_WQ_WORK_CANCEL; + if (worker->mm) + work->flags |= IO_WQ_WORK_HAS_MM; + + old_work = work; + work->func(&work); + + spin_lock_irq(&wqe->lock); + worker->cur_work = NULL; + if (hash != -1U) { + wqe->hash_map &= ~BIT_ULL(hash); + wqe->flags &= ~IO_WQE_FLAG_STALLED; + } + if (work && work != old_work) { + spin_unlock_irq(&wqe->lock); + /* dependent work not hashed */ + hash = -1U; + goto next; + } + } while (1); +} + +static inline bool io_wqe_run_queue(struct io_wqe *wqe) + __must_hold(wqe->lock) +{ + if (!list_empty_careful(&wqe->work_list) && + !(wqe->flags & IO_WQE_FLAG_STALLED)) + return true; + return false; +} + +static int io_wqe_worker(void *data) +{ + struct io_worker *worker = data; + struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = wqe->wq; + DEFINE_WAIT(wait); + + io_worker_start(wqe, worker); + + while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { + prepare_to_wait(&worker->wait, &wait, TASK_INTERRUPTIBLE); + + spin_lock_irq(&wqe->lock); + if (io_wqe_run_queue(wqe)) { + __set_current_state(TASK_RUNNING); + io_worker_handle_work(worker); + continue; + } + /* drops the lock on success, retry */ + if (__io_worker_idle(wqe, worker)) { + __release(&wqe->lock); + continue; + } + spin_unlock_irq(&wqe->lock); + if (signal_pending(current)) + flush_signals(current); + if (schedule_timeout(WORKER_IDLE_TIMEOUT)) + continue; + /* timed out, exit unless we're the fixed worker */ + if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || + !(worker->flags & IO_WORKER_F_FIXED)) + break; + } + + finish_wait(&worker->wait, &wait); + + if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { + spin_lock_irq(&wqe->lock); + if (!list_empty(&wqe->work_list)) + io_worker_handle_work(worker); + else + spin_unlock_irq(&wqe->lock); + } + + io_worker_exit(worker); + return 0; +} + +/* + * Check head of free list for an available worker. If one isn't available, + * caller must wake up the wq manager to create one. + */ +static bool io_wqe_activate_free_worker(struct io_wqe *wqe) + __must_hold(RCU) +{ + struct hlist_nulls_node *n; + struct io_worker *worker; + + n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list.head)); + if (is_a_nulls(n)) + return false; + + worker = hlist_nulls_entry(n, struct io_worker, nulls_node); + if (io_worker_get(worker)) { + wake_up(&worker->wait); + io_worker_release(worker); + return true; + } + + return false; +} + +/* + * We need a worker. If we find a free one, we're good. If not, and we're + * below the max number of workers, wake up the manager to create one. + */ +static void io_wqe_wake_worker(struct io_wqe *wqe) +{ + bool ret; + + rcu_read_lock(); + ret = io_wqe_activate_free_worker(wqe); + rcu_read_unlock(); + + if (!ret && wqe->nr_workers < wqe->max_workers) + wake_up_process(wqe->wq->manager); +} + +/* + * Called when a worker is scheduled in. Mark us as currently running. + */ +void io_wq_worker_running(struct task_struct *tsk) +{ + struct io_worker *worker = kthread_data(tsk); + struct io_wqe *wqe = worker->wqe; + + if (!(worker->flags & IO_WORKER_F_UP)) + return; + if (worker->flags & IO_WORKER_F_RUNNING) + return; + worker->flags |= IO_WORKER_F_RUNNING; + atomic_inc(&wqe->nr_running); +} + +/* + * Called when worker is going to sleep. If there are no workers currently + * running and we have work pending, wake up a free one or have the manager + * set one up. + */ +void io_wq_worker_sleeping(struct task_struct *tsk) +{ + struct io_worker *worker = kthread_data(tsk); + struct io_wqe *wqe = worker->wqe; + + if (!(worker->flags & IO_WORKER_F_UP)) + return; + if (!(worker->flags & IO_WORKER_F_RUNNING)) + return; + + worker->flags &= ~IO_WORKER_F_RUNNING; + + spin_lock_irq(&wqe->lock); + if (atomic_dec_and_test(&wqe->nr_running) && io_wqe_run_queue(wqe)) + io_wqe_wake_worker(wqe); + spin_unlock_irq(&wqe->lock); +} + +static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe) +{ + struct io_worker *worker; + + worker = kcalloc_node(1, sizeof(*worker), GFP_KERNEL, wqe->node); + if (!worker) + return; + + refcount_set(&worker->ref, 1); + worker->nulls_node.pprev = NULL; + init_waitqueue_head(&worker->wait); + worker->wqe = wqe; + + worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node, + "io_wqe_worker-%d", wqe->node); + if (IS_ERR(worker->task)) { + kfree(worker); + return; + } + + spin_lock_irq(&wqe->lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list.head); + worker->flags |= IO_WORKER_F_FREE; + if (!wqe->nr_workers) + worker->flags |= IO_WORKER_F_FIXED; + wqe->nr_workers++; + spin_unlock_irq(&wqe->lock); + + wake_up_process(worker->task); +} + +static inline bool io_wqe_need_new_worker(struct io_wqe *wqe) + __must_hold(wqe->lock) +{ + if (!wqe->nr_workers) + return true; + if (hlist_nulls_empty(&wqe->free_list.head) && + wqe->nr_workers < wqe->max_workers && io_wqe_run_queue(wqe)) + return true; + + return false; +} + +/* + * Manager thread. Tasked with creating new workers, if we need them. + */ +static int io_wq_manager(void *data) +{ + struct io_wq *wq = data; + + while (!kthread_should_stop()) { + int i; + + for (i = 0; i < wq->nr_wqes; i++) { + struct io_wqe *wqe = wq->wqes[i]; + bool fork_worker = false; + + spin_lock_irq(&wqe->lock); + fork_worker = io_wqe_need_new_worker(wqe); + spin_unlock_irq(&wqe->lock); + if (fork_worker) + create_io_worker(wq, wqe); + } + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } + + return 0; +} + +static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) +{ + unsigned long flags; + + spin_lock_irqsave(&wqe->lock, flags); + list_add_tail(&work->list, &wqe->work_list); + wqe->flags &= ~IO_WQE_FLAG_STALLED; + spin_unlock_irqrestore(&wqe->lock, flags); + + if (!atomic_read(&wqe->nr_running)) + io_wqe_wake_worker(wqe); +} + +void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) +{ + struct io_wqe *wqe = wq->wqes[numa_node_id()]; + + io_wqe_enqueue(wqe, work); +} + +/* + * Enqueue work, hashed by some key. Work items that hash to the same value + * will not be done in parallel. Used to limit concurrent writes, generally + * hashed by inode. + */ +void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val) +{ + struct io_wqe *wqe = wq->wqes[numa_node_id()]; + unsigned bit; + + + bit = hash_ptr(val, IO_WQ_HASH_ORDER); + work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); + io_wqe_enqueue(wqe, work); +} + +static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) +{ + send_sig(SIGINT, worker->task, 1); + return false; +} + +/* + * Iterate the passed in list and call the specific function for each + * worker that isn't exiting + */ +static bool io_wq_for_each_worker(struct io_wqe *wqe, + struct io_wq_nulls_list *list, + bool (*func)(struct io_worker *, void *), + void *data) +{ + struct hlist_nulls_node *n; + struct io_worker *worker; + bool ret = false; + +restart: + hlist_nulls_for_each_entry_rcu(worker, n, &list->head, nulls_node) { + if (io_worker_get(worker)) { + ret = func(worker, data); + io_worker_release(worker); + if (ret) + break; + } + } + if (!ret && get_nulls_value(n) != list->nulls) + goto restart; + return ret; +} + +void io_wq_cancel_all(struct io_wq *wq) +{ + int i; + + set_bit(IO_WQ_BIT_CANCEL, &wq->state); + + /* + * Browse both lists, as there's a gap between handing work off + * to a worker and the worker putting itself on the busy_list + */ + rcu_read_lock(); + for (i = 0; i < wq->nr_wqes; i++) { + struct io_wqe *wqe = wq->wqes[i]; + + io_wq_for_each_worker(wqe, &wqe->busy_list, + io_wqe_worker_send_sig, NULL); + io_wq_for_each_worker(wqe, &wqe->free_list, + io_wqe_worker_send_sig, NULL); + } + rcu_read_unlock(); +} + +static bool io_wq_worker_cancel(struct io_worker *worker, void *data) +{ + struct io_wq_work *work = data; + + if (worker->cur_work == work) { + send_sig(SIGINT, worker->task, 1); + return true; + } + + return false; +} + +static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, + struct io_wq_work *cwork) +{ + struct io_wq_work *work; + bool found = false; + + cwork->flags |= IO_WQ_WORK_CANCEL; + + /* + * First check pending list, if we're lucky we can just remove it + * from there. CANCEL_OK means that the work is returned as-new, + * no completion will be posted for it. + */ + spin_lock_irq(&wqe->lock); + list_for_each_entry(work, &wqe->work_list, list) { + if (work == cwork) { + list_del(&work->list); + found = true; + break; + } + } + spin_unlock_irq(&wqe->lock); + + if (found) { + work->flags |= IO_WQ_WORK_CANCEL; + work->func(&work); + return IO_WQ_CANCEL_OK; + } + + /* + * Now check if a free (going busy) or busy worker has the work + * currently running. If we find it there, we'll return CANCEL_RUNNING + * as an indication that we attempte to signal cancellation. The + * completion will run normally in this case. + */ + rcu_read_lock(); + found = io_wq_for_each_worker(wqe, &wqe->free_list, io_wq_worker_cancel, + cwork); + if (found) + goto done; + + found = io_wq_for_each_worker(wqe, &wqe->busy_list, io_wq_worker_cancel, + cwork); +done: + rcu_read_unlock(); + return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; +} + +enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) +{ + enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; + int i; + + for (i = 0; i < wq->nr_wqes; i++) { + struct io_wqe *wqe = wq->wqes[i]; + + ret = io_wqe_cancel_work(wqe, cwork); + if (ret != IO_WQ_CANCEL_NOTFOUND) + break; + } + + return ret; +} + +struct io_wq_flush_data { + struct io_wq_work work; + struct completion done; +}; + +static void io_wq_flush_func(struct io_wq_work **workptr) +{ + struct io_wq_work *work = *workptr; + struct io_wq_flush_data *data; + + data = container_of(work, struct io_wq_flush_data, work); + complete(&data->done); +} + +/* + * Doesn't wait for previously queued work to finish. When this completes, + * it just means that previously queued work was started. + */ +void io_wq_flush(struct io_wq *wq) +{ + struct io_wq_flush_data data; + int i; + + for (i = 0; i < wq->nr_wqes; i++) { + struct io_wqe *wqe = wq->wqes[i]; + + init_completion(&data.done); + INIT_IO_WORK(&data.work, io_wq_flush_func); + io_wqe_enqueue(wqe, &data.work); + wait_for_completion(&data.done); + } +} + +struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm) +{ + int ret = -ENOMEM, i, node; + struct io_wq *wq; + + wq = kcalloc(1, sizeof(*wq), GFP_KERNEL); + if (!wq) + return ERR_PTR(-ENOMEM); + + wq->nr_wqes = num_online_nodes(); + wq->wqes = kcalloc(wq->nr_wqes, sizeof(struct io_wqe *), GFP_KERNEL); + if (!wq->wqes) { + kfree(wq); + return ERR_PTR(-ENOMEM); + } + + i = 0; + refcount_set(&wq->refs, wq->nr_wqes); + for_each_online_node(node) { + struct io_wqe *wqe; + + wqe = kcalloc_node(1, sizeof(struct io_wqe), GFP_KERNEL, node); + if (!wqe) + break; + wq->wqes[i] = wqe; + wqe->node = node; + wqe->max_workers = concurrency; + wqe->node = node; + wqe->wq = wq; + spin_lock_init(&wqe->lock); + INIT_LIST_HEAD(&wqe->work_list); + INIT_HLIST_NULLS_HEAD(&wqe->free_list.head, 0); + wqe->free_list.nulls = 0; + INIT_HLIST_NULLS_HEAD(&wqe->busy_list.head, 1); + wqe->busy_list.nulls = 1; + atomic_set(&wqe->nr_running, 0); + + i++; + } + + init_completion(&wq->done); + + if (i != wq->nr_wqes) + goto err; + + /* caller must have already done mmgrab() on this mm */ + wq->mm = mm; + + wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); + if (!IS_ERR(wq->manager)) { + wake_up_process(wq->manager); + return wq; + } + + ret = PTR_ERR(wq->manager); + wq->manager = NULL; +err: + complete(&wq->done); + io_wq_destroy(wq); + return ERR_PTR(ret); +} + +static bool io_wq_worker_wake(struct io_worker *worker, void *data) +{ + wake_up_process(worker->task); + return false; +} + +void io_wq_destroy(struct io_wq *wq) +{ + int i; + + if (wq->manager) { + set_bit(IO_WQ_BIT_EXIT, &wq->state); + kthread_stop(wq->manager); + } + + rcu_read_lock(); + for (i = 0; i < wq->nr_wqes; i++) { + struct io_wqe *wqe = wq->wqes[i]; + + if (!wqe) + continue; + io_wq_for_each_worker(wqe, &wqe->free_list, io_wq_worker_wake, + NULL); + io_wq_for_each_worker(wqe, &wqe->busy_list, io_wq_worker_wake, + NULL); + } + rcu_read_unlock(); + + wait_for_completion(&wq->done); + + for (i = 0; i < wq->nr_wqes; i++) + kfree(wq->wqes[i]); + kfree(wq->wqes); + kfree(wq); +} diff --git a/fs/io-wq.h b/fs/io-wq.h new file mode 100644 index 000000000000..be8f22c8937b --- /dev/null +++ b/fs/io-wq.h @@ -0,0 +1,55 @@ +#ifndef INTERNAL_IO_WQ_H +#define INTERNAL_IO_WQ_H + +struct io_wq; + +enum { + IO_WQ_WORK_CANCEL = 1, + IO_WQ_WORK_HAS_MM = 2, + IO_WQ_WORK_HASHED = 4, + IO_WQ_WORK_NEEDS_USER = 8, + + IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ +}; + +enum io_wq_cancel { + IO_WQ_CANCEL_OK, /* cancelled before started */ + IO_WQ_CANCEL_RUNNING, /* found, running, and attempted cancelled */ + IO_WQ_CANCEL_NOTFOUND, /* work not found */ +}; + +struct io_wq_work { + struct list_head list; + void (*func)(struct io_wq_work **); + unsigned flags; +}; + +#define INIT_IO_WORK(work, _func) \ + do { \ + (work)->func = _func; \ + (work)->flags = 0; \ + } while (0) \ + +struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm); +void io_wq_destroy(struct io_wq *wq); + +void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); +void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val); +void io_wq_flush(struct io_wq *wq); + +void io_wq_cancel_all(struct io_wq *wq); +enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); + +#if defined(CONFIG_IO_WQ) +extern void io_wq_worker_sleeping(struct task_struct *); +extern void io_wq_worker_running(struct task_struct *); +#else +static inline void io_wq_worker_sleeping(struct task_struct *tsk) +{ +} +static inline void io_wq_worker_running(struct task_struct *tsk) +{ +} +#endif + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 67a1d86981a9..6666e25606b7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1468,6 +1468,7 @@ extern struct pid *cad_pid; #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ +#define PF_IO_WORKER 0x20000000 /* Task is an IO worker */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd05a378631a..a95a2f05f3b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,7 @@ #include #include "../workqueue_internal.h" +#include "../../fs/io-wq.h" #include "../smpboot.h" #include "pelt.h" @@ -4103,9 +4104,12 @@ static inline void sched_submit_work(struct task_struct *tsk) * we disable preemption to avoid it calling schedule() again * in the possible wakeup of a kworker. */ - if (tsk->flags & PF_WQ_WORKER) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { preempt_disable(); - wq_worker_sleeping(tsk); + if (tsk->flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + else + io_wq_worker_sleeping(tsk); preempt_enable_no_resched(); } @@ -4122,8 +4126,12 @@ static inline void sched_submit_work(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & PF_WQ_WORKER) - wq_worker_running(tsk); + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); + else + io_wq_worker_running(tsk); + } } asmlinkage __visible void __sched schedule(void) From 561fb04a6a2257716738dac2ed812f377c2634c2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Oct 2019 07:25:42 -0600 Subject: [PATCH 12/64] io_uring: replace workqueue usage with io-wq Drop various work-arounds we have for workqueues: - We no longer need the async_list for tracking sequential IO. - We don't have to maintain our own mm tracking/setting. - We don't need a separate workqueue for buffered writes. This didn't even work that well to begin with, as it was suboptimal for multiple buffered writers on multiple files. - We can properly cancel pending interruptible work. This fixes deadlocks with particularly socket IO, where we cannot cancel them when the io_uring is closed. Hence the ring will wait forever for these requests to complete, which may never happen. This is different from disk IO where we know requests will complete in a finite amount of time. - Due to being able to cancel work interruptible work that is already running, we can implement file table support for work. We need that for supporting system calls that add to a process file table. - It gets us one step closer to adding async support for any system call. Signed-off-by: Jens Axboe --- fs/io_uring.c | 419 ++++++++------------------------ include/trace/events/io_uring.h | 12 +- init/Kconfig | 1 + 3 files changed, 108 insertions(+), 324 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f9eff8f62ddb..d94bd4e3a60e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include @@ -77,6 +76,7 @@ #include #include "internal.h" +#include "io-wq.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -165,16 +165,6 @@ struct io_mapped_ubuf { unsigned int nr_bvecs; }; -struct async_list { - spinlock_t lock; - atomic_t cnt; - struct list_head list; - - struct file *file; - off_t io_start; - size_t io_len; -}; - struct io_ring_ctx { struct { struct percpu_ref refs; @@ -209,7 +199,7 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; /* IO offload */ - struct workqueue_struct *sqo_wq[2]; + struct io_wq *io_wq; struct task_struct *sqo_thread; /* if using sq thread polling */ struct mm_struct *sqo_mm; wait_queue_head_t sqo_wait; @@ -262,8 +252,6 @@ struct io_ring_ctx { struct list_head cancel_list; } ____cacheline_aligned_in_smp; - struct async_list pending_async[2]; - #if defined(CONFIG_UNIX) struct socket *ring_sock; #endif @@ -333,7 +321,7 @@ struct io_kiocb { u32 result; u32 sequence; - struct work_struct work; + struct io_wq_work work; }; #define IO_PLUG_THRESHOLD 2 @@ -359,7 +347,7 @@ struct io_submit_state { unsigned int ios_left; }; -static void io_sq_wq_submit_work(struct work_struct *work); +static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, long res); static void __io_free_req(struct io_kiocb *req); @@ -391,7 +379,6 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - int i; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -409,11 +396,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_completion(&ctx->sqo_thread_started); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); - for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) { - spin_lock_init(&ctx->pending_async[i].lock); - INIT_LIST_HEAD(&ctx->pending_async[i].list); - atomic_set(&ctx->pending_async[i].cnt, 0); - } spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->cancel_list); @@ -479,22 +461,45 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline void io_queue_async_work(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) { - int rw = 0; + u8 opcode = READ_ONCE(sqe->opcode); + + return !(opcode == IORING_OP_READ_FIXED || + opcode == IORING_OP_WRITE_FIXED); +} + +static inline bool io_prep_async_work(struct io_kiocb *req) +{ + bool do_hashed = false; if (req->submit.sqe) { switch (req->submit.sqe->opcode) { case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: - rw = !(req->rw.ki_flags & IOCB_DIRECT); + do_hashed = true; break; } + if (io_sqe_needs_user(req->submit.sqe)) + req->work.flags |= IO_WQ_WORK_NEEDS_USER; } - trace_io_uring_queue_async_work(ctx, rw, req, &req->work, req->flags); - queue_work(ctx->sqo_wq[rw], &req->work); + return do_hashed; +} + +static inline void io_queue_async_work(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + bool do_hashed = io_prep_async_work(req); + + trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work, + req->flags); + if (!do_hashed) { + io_wq_enqueue(ctx->io_wq, &req->work); + } else { + io_wq_enqueue_hashed(ctx->io_wq, &req->work, + file_inode(req->file)); + } } static void io_kill_timeout(struct io_kiocb *req) @@ -647,6 +652,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); req->result = 0; + INIT_IO_WORK(&req->work, io_wq_submit_work); return req; out: percpu_ref_put(&ctx->refs); @@ -693,12 +699,10 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * If we're in async work, we can continue processing the chain * in this context instead of having to queue up new async work. */ - if (nxtptr && current_work()) { + if (nxtptr && current_work()) *nxtptr = nxt; - } else { - INIT_WORK(&nxt->work, io_sq_wq_submit_work); + else io_queue_async_work(req->ctx, nxt); - } } } @@ -757,12 +761,10 @@ static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr) nxt = io_put_req_find_next(req); if (nxt) { - if (nxtptr) { + if (nxtptr) *nxtptr = nxt; - } else { - INIT_WORK(&nxt->work, io_sq_wq_submit_work); + else io_queue_async_work(nxt->ctx, nxt); - } } } @@ -1324,65 +1326,6 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); } -static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb) -{ - if (al->file == kiocb->ki_filp) { - off_t start, end; - - /* - * Allow merging if we're anywhere in the range of the same - * page. Generally this happens for sub-page reads or writes, - * and it's beneficial to allow the first worker to bring the - * page in and the piggy backed work can then work on the - * cached page. - */ - start = al->io_start & PAGE_MASK; - end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK; - if (kiocb->ki_pos >= start && kiocb->ki_pos <= end) - return true; - } - - al->file = NULL; - return false; -} - -/* - * Make a note of the last file/offset/direction we punted to async - * context. We'll use this information to see if we can piggy back a - * sequential request onto the previous one, if it's still hasn't been - * completed by the async worker. - */ -static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) -{ - struct async_list *async_list = &req->ctx->pending_async[rw]; - struct kiocb *kiocb = &req->rw; - struct file *filp = kiocb->ki_filp; - - if (io_should_merge(async_list, kiocb)) { - unsigned long max_bytes; - - /* Use 8x RA size as a decent limiter for both reads/writes */ - max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3); - if (!max_bytes) - max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3); - - /* If max len are exceeded, reset the state */ - if (async_list->io_len + len <= max_bytes) { - req->flags |= REQ_F_SEQ_PREV; - async_list->io_len += len; - } else { - async_list->file = NULL; - } - } - - /* New file? Reset state. */ - if (async_list->file != filp) { - async_list->io_start = kiocb->ki_pos; - async_list->io_len = len; - async_list->file = filp; - } -} - /* * For files that don't have ->read_iter() and ->write_iter(), handle them * by looping over ->read() or ->write() manually. @@ -1477,13 +1420,10 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, ret2 > 0 && ret2 < read_size) ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || ret2 != -EAGAIN) { + if (!force_nonblock || ret2 != -EAGAIN) kiocb_done(kiocb, ret2, nxt, s->in_async); - } else { - if (!s->in_async) - io_async_list_note(READ, req, iov_count); + else ret = -EAGAIN; - } } kfree(iovec); return ret; @@ -1517,11 +1457,8 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, iov_count = iov_iter_count(&iter); ret = -EAGAIN; - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) { - if (!s->in_async) - io_async_list_note(WRITE, req, iov_count); + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) goto out_free; - } ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); if (!ret) { @@ -1546,13 +1483,10 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, ret2 = call_write_iter(file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); - if (!force_nonblock || ret2 != -EAGAIN) { + if (!force_nonblock || ret2 != -EAGAIN) kiocb_done(kiocb, ret2, nxt, s->in_async); - } else { - if (!s->in_async) - io_async_list_note(WRITE, req, iov_count); + else ret = -EAGAIN; - } } out_free: kfree(iovec); @@ -1794,14 +1728,18 @@ static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req, io_commit_cqring(ctx); } -static void io_poll_complete_work(struct work_struct *work) +static void io_poll_complete_work(struct io_wq_work **workptr) { + struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_poll_iocb *poll = &req->poll; struct poll_table_struct pt = { ._key = poll->events }; struct io_ring_ctx *ctx = req->ctx; __poll_t mask = 0; + if (work->flags & IO_WQ_WORK_CANCEL) + WRITE_ONCE(poll->canceled, true); + if (!READ_ONCE(poll->canceled)) mask = vfs_poll(poll->file, &pt) & poll->events; @@ -1894,7 +1832,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EBADF; req->submit.sqe = NULL; - INIT_WORK(&req->work, io_poll_complete_work); + INIT_IO_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; @@ -2152,7 +2090,6 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, memcpy(sqe_copy, sqe, sizeof(*sqe_copy)); req->submit.sqe = sqe_copy; - INIT_WORK(&req->work, io_sq_wq_submit_work); trace_io_uring_defer(ctx, req, false); list_add_tail(&req->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); @@ -2235,188 +2172,56 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return 0; } -static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx, - const struct io_uring_sqe *sqe) -{ - switch (sqe->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - return &ctx->pending_async[READ]; - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - return &ctx->pending_async[WRITE]; - default: - return NULL; - } -} - -static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) -{ - u8 opcode = READ_ONCE(sqe->opcode); - - return !(opcode == IORING_OP_READ_FIXED || - opcode == IORING_OP_WRITE_FIXED); -} - -static void io_sq_wq_submit_work(struct work_struct *work) +static void io_wq_submit_work(struct io_wq_work **workptr) { + struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_ring_ctx *ctx = req->ctx; - struct mm_struct *cur_mm = NULL; - struct async_list *async_list; - LIST_HEAD(req_list); - mm_segment_t old_fs; - int ret; + struct sqe_submit *s = &req->submit; + const struct io_uring_sqe *sqe = s->sqe; + struct io_kiocb *nxt = NULL; + int ret = 0; - async_list = io_async_list_from_sqe(ctx, req->submit.sqe); -restart: - do { - struct sqe_submit *s = &req->submit; - const struct io_uring_sqe *sqe = s->sqe; - unsigned int flags = req->flags; - struct io_kiocb *nxt = NULL; + /* Ensure we clear previously set non-block flag */ + req->rw.ki_flags &= ~IOCB_NOWAIT; - /* Ensure we clear previously set non-block flag */ - req->rw.ki_flags &= ~IOCB_NOWAIT; + if (work->flags & IO_WQ_WORK_CANCEL) + ret = -ECANCELED; - ret = 0; - if (io_sqe_needs_user(sqe) && !cur_mm) { - if (!mmget_not_zero(ctx->sqo_mm)) { - ret = -EFAULT; - } else { - cur_mm = ctx->sqo_mm; - use_mm(cur_mm); - old_fs = get_fs(); - set_fs(USER_DS); - } - } + if (!ret) { + s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; + s->in_async = true; + do { + ret = __io_submit_sqe(ctx, req, s, &nxt, false); + /* + * We can get EAGAIN for polled IO even though we're + * forcing a sync submission from here, since we can't + * wait for request slots on the block side. + */ + if (ret != -EAGAIN) + break; + cond_resched(); + } while (1); + } - if (!ret) { - s->has_user = cur_mm != NULL; - s->in_async = true; - do { - ret = __io_submit_sqe(ctx, req, s, &nxt, false); - /* - * We can get EAGAIN for polled IO even though - * we're forcing a sync submission from here, - * since we can't wait for request slots on the - * block side. - */ - if (ret != -EAGAIN) - break; - cond_resched(); - } while (1); - } + /* drop submission reference */ + io_put_req(req, NULL); - /* drop submission reference */ + if (ret) { + io_cqring_add_event(ctx, sqe->user_data, ret); io_put_req(req, NULL); - - if (ret) { - io_cqring_add_event(ctx, sqe->user_data, ret); - io_put_req(req, NULL); - } - - /* async context always use a copy of the sqe */ - kfree(sqe); - - /* if a dependent link is ready, do that as the next one */ - if (!ret && nxt) { - req = nxt; - continue; - } - - /* req from defer and link list needn't decrease async cnt */ - if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE)) - goto out; - - if (!async_list) - break; - if (!list_empty(&req_list)) { - req = list_first_entry(&req_list, struct io_kiocb, - list); - list_del(&req->list); - continue; - } - if (list_empty(&async_list->list)) - break; - - req = NULL; - spin_lock(&async_list->lock); - if (list_empty(&async_list->list)) { - spin_unlock(&async_list->lock); - break; - } - list_splice_init(&async_list->list, &req_list); - spin_unlock(&async_list->lock); - - req = list_first_entry(&req_list, struct io_kiocb, list); - list_del(&req->list); - } while (req); - - /* - * Rare case of racing with a submitter. If we find the count has - * dropped to zero AND we have pending work items, then restart - * the processing. This is a tiny race window. - */ - if (async_list) { - ret = atomic_dec_return(&async_list->cnt); - while (!ret && !list_empty(&async_list->list)) { - spin_lock(&async_list->lock); - atomic_inc(&async_list->cnt); - list_splice_init(&async_list->list, &req_list); - spin_unlock(&async_list->lock); - - if (!list_empty(&req_list)) { - req = list_first_entry(&req_list, - struct io_kiocb, list); - list_del(&req->list); - goto restart; - } - ret = atomic_dec_return(&async_list->cnt); - } } -out: - if (cur_mm) { - set_fs(old_fs); - unuse_mm(cur_mm); - mmput(cur_mm); + /* async context always use a copy of the sqe */ + kfree(sqe); + + /* if a dependent link is ready, pass it back */ + if (!ret && nxt) { + io_prep_async_work(nxt); + *workptr = &nxt->work; } } -/* - * See if we can piggy back onto previously submitted work, that is still - * running. We currently only allow this if the new request is sequential - * to the previous one we punted. - */ -static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req) -{ - bool ret; - - if (!list) - return false; - if (!(req->flags & REQ_F_SEQ_PREV)) - return false; - if (!atomic_read(&list->cnt)) - return false; - - ret = true; - spin_lock(&list->lock); - list_add_tail(&req->list, &list->list); - /* - * Ensure we see a simultaneous modification from io_sq_wq_submit_work() - */ - smp_mb(); - if (!atomic_read(&list->cnt)) { - list_del_init(&req->list); - ret = false; - } - spin_unlock(&list->lock); - - trace_io_uring_add_to_prev(req, ret); - return ret; -} - static bool io_op_needs_file(const struct io_uring_sqe *sqe) { int op = READ_ONCE(sqe->opcode); @@ -2488,17 +2293,9 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); if (sqe_copy) { - struct async_list *list; - s->sqe = sqe_copy; memcpy(&req->submit, s, sizeof(*s)); - list = io_async_list_from_sqe(ctx, s->sqe); - if (!io_add_to_prev_work(list, req)) { - if (list) - atomic_inc(&list->cnt); - INIT_WORK(&req->work, io_sq_wq_submit_work); - io_queue_async_work(ctx, req); - } + io_queue_async_work(ctx, req); /* * Queued up for async execution, worker will release @@ -3109,15 +2906,11 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx) static void io_finish_async(struct io_ring_ctx *ctx) { - int i; - io_sq_thread_stop(ctx); - for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) { - if (ctx->sqo_wq[i]) { - destroy_workqueue(ctx->sqo_wq[i]); - ctx->sqo_wq[i] = NULL; - } + if (ctx->io_wq) { + io_wq_destroy(ctx->io_wq); + ctx->io_wq = NULL; } } @@ -3125,11 +2918,9 @@ static void io_finish_async(struct io_ring_ctx *ctx) static void io_destruct_skb(struct sk_buff *skb) { struct io_ring_ctx *ctx = skb->sk->sk_user_data; - int i; - for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) - if (ctx->sqo_wq[i]) - flush_workqueue(ctx->sqo_wq[i]); + if (ctx->io_wq) + io_wq_flush(ctx->io_wq); unix_destruct_scm(skb); } @@ -3473,6 +3264,7 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { + unsigned concurrency; int ret; init_waitqueue_head(&ctx->sqo_wait); @@ -3516,25 +3308,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } - /* Do QD, or 2 * CPUS, whatever is smallest */ - ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq", - WQ_UNBOUND | WQ_FREEZABLE, - min(ctx->sq_entries - 1, 2 * num_online_cpus())); - if (!ctx->sqo_wq[0]) { - ret = -ENOMEM; - goto err; - } - - /* - * This is for buffered writes, where we want to limit the parallelism - * due to file locking in file systems. As "normal" buffered writes - * should parellelize on writeout quite nicely, limit us to having 2 - * pending. This avoids massive contention on the inode when doing - * buffered async writes. - */ - ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq", - WQ_UNBOUND | WQ_FREEZABLE, 2); - if (!ctx->sqo_wq[1]) { + /* Do QD, or 4 * CPUS, whatever is smallest */ + concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); + ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm); + if (!ctx->io_wq) { ret = -ENOMEM; goto err; } @@ -3919,6 +3696,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_kill_timeouts(ctx); io_poll_remove_all(ctx); + + if (ctx->io_wq) + io_wq_cancel_all(ctx->io_wq); + io_iopoll_reap_events(ctx); wait_for_completion(&ctx->ctx_done); io_ring_ctx_free(ctx); diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index c5a905fbf1da..b85255121b98 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -7,6 +7,8 @@ #include +struct io_wq_work; + /** * io_uring_create - called after a new io_uring context was prepared * @@ -126,15 +128,15 @@ TRACE_EVENT(io_uring_file_get, * io_uring_queue_async_work - called before submitting a new async work * * @ctx: pointer to a ring context structure - * @rw: type of workqueue, normal or buffered writes + * @hashed: type of workqueue, hashed or normal * @req: pointer to a submitted request - * @work: pointer to a submitted work_struct + * @work: pointer to a submitted io_wq_work * * Allows to trace asynchronous work submission. */ TRACE_EVENT(io_uring_queue_async_work, - TP_PROTO(void *ctx, int rw, void * req, struct work_struct *work, + TP_PROTO(void *ctx, int rw, void * req, struct io_wq_work *work, unsigned int flags), TP_ARGS(ctx, rw, req, work, flags), @@ -143,7 +145,7 @@ TRACE_EVENT(io_uring_queue_async_work, __field( void *, ctx ) __field( int, rw ) __field( void *, req ) - __field( struct work_struct *, work ) + __field( struct io_wq_work *, work ) __field( unsigned int, flags ) ), @@ -157,7 +159,7 @@ TRACE_EVENT(io_uring_queue_async_work, TP_printk("ring %p, request %p, flags %d, %s queue, work %p", __entry->ctx, __entry->req, __entry->flags, - __entry->rw ? "buffered" : "normal", __entry->work) + __entry->rw ? "hashed" : "normal", __entry->work) ); /** diff --git a/init/Kconfig b/init/Kconfig index b4daad2bac23..4d8d145c41d2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1548,6 +1548,7 @@ config AIO config IO_URING bool "Enable IO uring support" if EXPERT select ANON_INODES + select IO_WQ default y help This option enables support for the io_uring interface, enabling From fcb323cc53e29d9cc696d606bb42736b32dd9825 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Oct 2019 12:39:47 -0600 Subject: [PATCH 13/64] io_uring: io_uring: add support for async work inheriting files This is in preparation for adding opcodes that need to add new files in a process file table, system calls like open(2) or accept4(2). If an opcode needs this, it must set IO_WQ_WORK_NEEDS_FILES in the work item. If work that needs to get punted to async context have this set, the async worker will assume the original task file table before executing the work. Note that opcodes that need access to the current files of an application cannot be done through IORING_SETUP_SQPOLL. Signed-off-by: Jens Axboe --- fs/io-wq.c | 30 +++++++++++-- fs/io-wq.h | 3 ++ fs/io_uring.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 141 insertions(+), 8 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 37863879e987..253c04a40db5 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -52,6 +52,7 @@ struct io_worker { struct rcu_head rcu; struct mm_struct *mm; + struct files_struct *restore_files; }; struct io_wq_nulls_list { @@ -126,22 +127,36 @@ static void io_worker_release(struct io_worker *worker) */ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) { + bool dropped_lock = false; + + if (current->files != worker->restore_files) { + __acquire(&wqe->lock); + spin_unlock_irq(&wqe->lock); + dropped_lock = true; + + task_lock(current); + current->files = worker->restore_files; + task_unlock(current); + } + /* * If we have an active mm, we need to drop the wq lock before unusing * it. If we do, return true and let the caller retry the idle loop. */ if (worker->mm) { - __acquire(&wqe->lock); - spin_unlock_irq(&wqe->lock); + if (!dropped_lock) { + __acquire(&wqe->lock); + spin_unlock_irq(&wqe->lock); + dropped_lock = true; + } __set_current_state(TASK_RUNNING); set_fs(KERNEL_DS); unuse_mm(worker->mm); mmput(worker->mm); worker->mm = NULL; - return true; } - return false; + return dropped_lock; } static void io_worker_exit(struct io_worker *worker) @@ -189,6 +204,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) current->flags |= PF_IO_WORKER; worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); + worker->restore_files = current->files; atomic_inc(&wqe->nr_running); } @@ -291,6 +307,12 @@ static void io_worker_handle_work(struct io_worker *worker) if (!work) break; next: + if ((work->flags & IO_WQ_WORK_NEEDS_FILES) && + current->files != work->files) { + task_lock(current); + current->files = work->files; + task_unlock(current); + } if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm && wq->mm && mmget_not_zero(wq->mm)) { use_mm(wq->mm); diff --git a/fs/io-wq.h b/fs/io-wq.h index be8f22c8937b..e93f764b1fa4 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -8,6 +8,7 @@ enum { IO_WQ_WORK_HAS_MM = 2, IO_WQ_WORK_HASHED = 4, IO_WQ_WORK_NEEDS_USER = 8, + IO_WQ_WORK_NEEDS_FILES = 16, IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -22,12 +23,14 @@ struct io_wq_work { struct list_head list; void (*func)(struct io_wq_work **); unsigned flags; + struct files_struct *files; }; #define INIT_IO_WORK(work, _func) \ do { \ (work)->func = _func; \ (work)->flags = 0; \ + (work)->files = NULL; \ } while (0) \ struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm); diff --git a/fs/io_uring.c b/fs/io_uring.c index d94bd4e3a60e..6e1523567920 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -196,6 +196,8 @@ struct io_ring_ctx { struct list_head defer_list; struct list_head timeout_list; + + wait_queue_head_t inflight_wait; } ____cacheline_aligned_in_smp; /* IO offload */ @@ -250,6 +252,9 @@ struct io_ring_ctx { */ struct list_head poll_list; struct list_head cancel_list; + + spinlock_t inflight_lock; + struct list_head inflight_list; } ____cacheline_aligned_in_smp; #if defined(CONFIG_UNIX) @@ -259,6 +264,8 @@ struct io_ring_ctx { struct sqe_submit { const struct io_uring_sqe *sqe; + struct file *ring_file; + int ring_fd; u32 sequence; bool has_user; bool in_async; @@ -317,10 +324,13 @@ struct io_kiocb { #define REQ_F_TIMEOUT 1024 /* timeout request */ #define REQ_F_ISREG 2048 /* regular file */ #define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ +#define REQ_F_INFLIGHT 8192 /* on inflight list */ u64 user_data; u32 result; u32 sequence; + struct list_head inflight_entry; + struct io_wq_work work; }; @@ -401,6 +411,9 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cancel_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); + init_waitqueue_head(&ctx->inflight_wait); + spin_lock_init(&ctx->inflight_lock); + INIT_LIST_HEAD(&ctx->inflight_list); return ctx; } @@ -670,9 +683,20 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) static void __io_free_req(struct io_kiocb *req) { + struct io_ring_ctx *ctx = req->ctx; + if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); - percpu_ref_put(&req->ctx->refs); + if (req->flags & REQ_F_INFLIGHT) { + unsigned long flags; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + list_del(&req->inflight_entry); + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + } + percpu_ref_put(&ctx->refs); kmem_cache_free(req_cachep, req); } @@ -2276,6 +2300,30 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, return 0; } +static int io_grab_files(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + int ret = -EBADF; + + rcu_read_lock(); + spin_lock_irq(&ctx->inflight_lock); + /* + * We use the f_ops->flush() handler to ensure that we can flush + * out work accessing these files if the fd is closed. Check if + * the fd has changed since we started down this path, and disallow + * this operation if it has. + */ + if (fcheck(req->submit.ring_fd) == req->submit.ring_file) { + list_add(&req->inflight_entry, &ctx->inflight_list); + req->flags |= REQ_F_INFLIGHT; + req->work.files = current->files; + ret = 0; + } + spin_unlock_irq(&ctx->inflight_lock); + rcu_read_unlock(); + + return ret; +} + static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, struct sqe_submit *s) { @@ -2295,17 +2343,25 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (sqe_copy) { s->sqe = sqe_copy; memcpy(&req->submit, s, sizeof(*s)); - io_queue_async_work(ctx, req); + if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { + ret = io_grab_files(ctx, req); + if (ret) { + kfree(sqe_copy); + goto err; + } + } /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. */ + io_queue_async_work(ctx, req); return 0; } } /* drop submission reference */ +err: io_put_req(req, NULL); /* and drop final reference, if we failed */ @@ -2509,6 +2565,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) head = READ_ONCE(sq_array[head & ctx->sq_mask]); if (head < ctx->sq_entries) { + s->ring_file = NULL; s->sqe = &ctx->sq_sqes[head]; s->sequence = ctx->cached_sq_head; ctx->cached_sq_head++; @@ -2708,7 +2765,8 @@ static int io_sq_thread(void *data) return 0; } -static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) +static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, + struct file *ring_file, int ring_fd) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; @@ -2750,9 +2808,11 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) } out: + s.ring_file = ring_file; s.has_user = true; s.in_async = false; s.needs_fixed_file = false; + s.ring_fd = ring_fd; submit++; trace_io_uring_submit_sqe(ctx, true, false); io_submit_sqe(ctx, &s, statep, &link); @@ -3714,6 +3774,53 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } +static void io_uring_cancel_files(struct io_ring_ctx *ctx, + struct files_struct *files) +{ + struct io_kiocb *req; + DEFINE_WAIT(wait); + + while (!list_empty_careful(&ctx->inflight_list)) { + enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; + + spin_lock_irq(&ctx->inflight_lock); + list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { + if (req->work.files == files) { + ret = io_wq_cancel_work(ctx->io_wq, &req->work); + break; + } + } + if (ret == IO_WQ_CANCEL_RUNNING) + prepare_to_wait(&ctx->inflight_wait, &wait, + TASK_UNINTERRUPTIBLE); + + spin_unlock_irq(&ctx->inflight_lock); + + /* + * We need to keep going until we get NOTFOUND. We only cancel + * one work at the time. + * + * If we get CANCEL_RUNNING, then wait for a work to complete + * before continuing. + */ + if (ret == IO_WQ_CANCEL_OK) + continue; + else if (ret != IO_WQ_CANCEL_RUNNING) + break; + schedule(); + } +} + +static int io_uring_flush(struct file *file, void *data) +{ + struct io_ring_ctx *ctx = file->private_data; + + io_uring_cancel_files(ctx, data); + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) + io_wq_cancel_all(ctx->io_wq); + return 0; +} + static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT; @@ -3782,7 +3889,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); - submitted = io_ring_submit(ctx, to_submit); + submitted = io_ring_submit(ctx, to_submit, f.file, fd); mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { @@ -3805,6 +3912,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, static const struct file_operations io_uring_fops = { .release = io_uring_release, + .flush = io_uring_flush, .mmap = io_uring_mmap, .poll = io_uring_poll, .fasync = io_uring_fasync, From de2ea4b64b75a79ed9cdf9bf30e0e197901084e4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Oct 2019 14:41:29 -0600 Subject: [PATCH 14/64] net: add __sys_accept4_file() helper This is identical to __sys_accept4(), except it takes a struct file instead of an fd, and it also allows passing in extra file->f_flags flags. The latter is done to support masking in O_NONBLOCK without manipulating the original file flags. No functional changes in this patch. Cc: netdev@vger.kernel.org Acked-by: David S. Miller Signed-off-by: Jens Axboe --- include/linux/socket.h | 3 ++ net/socket.c | 65 ++++++++++++++++++++++++++---------------- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index fc0bed59fc84..dd061f741bc1 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -392,6 +392,9 @@ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, extern int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len); +extern int __sys_accept4_file(struct file *file, unsigned file_flags, + struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags); extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); diff --git a/net/socket.c b/net/socket.c index 6a9ab7a8b1d2..40ab39f6c5d8 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1690,24 +1690,13 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog) return __sys_listen(fd, backlog); } -/* - * For accept, we attempt to create a new socket, set up the link - * with the client, wake up the client, then return the new - * connected fd. We collect the address of the connector in kernel - * space and move it to user at the very end. This is unclean because - * we open the socket then return an error. - * - * 1003.1g adds the ability to recvmsg() to query connection pending - * status to recvmsg. We need to add that support in a way thats - * clean when we restructure accept also. - */ - -int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags) +int __sys_accept4_file(struct file *file, unsigned file_flags, + struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags) { struct socket *sock, *newsock; struct file *newfile; - int err, len, newfd, fput_needed; + int err, len, newfd; struct sockaddr_storage address; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) @@ -1716,14 +1705,14 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - sock = sockfd_lookup_light(fd, &err, &fput_needed); + sock = sock_from_file(file, &err); if (!sock) goto out; err = -ENFILE; newsock = sock_alloc(); if (!newsock) - goto out_put; + goto out; newsock->type = sock->type; newsock->ops = sock->ops; @@ -1738,20 +1727,21 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, if (unlikely(newfd < 0)) { err = newfd; sock_release(newsock); - goto out_put; + goto out; } newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); if (IS_ERR(newfile)) { err = PTR_ERR(newfile); put_unused_fd(newfd); - goto out_put; + goto out; } err = security_socket_accept(sock, newsock); if (err) goto out_fd; - err = sock->ops->accept(sock, newsock, sock->file->f_flags, false); + err = sock->ops->accept(sock, newsock, sock->file->f_flags | file_flags, + false); if (err < 0) goto out_fd; @@ -1772,15 +1762,42 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, fd_install(newfd, newfile); err = newfd; - -out_put: - fput_light(sock->file, fput_needed); out: return err; out_fd: fput(newfile); put_unused_fd(newfd); - goto out_put; + goto out; + +} + +/* + * For accept, we attempt to create a new socket, set up the link + * with the client, wake up the client, then return the new + * connected fd. We collect the address of the connector in kernel + * space and move it to user at the very end. This is unclean because + * we open the socket then return an error. + * + * 1003.1g adds the ability to recvmsg() to query connection pending + * status to recvmsg. We need to add that support in a way thats + * clean when we restructure accept also. + */ + +int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags) +{ + int ret = -EBADF; + struct fd f; + + f = fdget(fd); + if (f.file) { + ret = __sys_accept4_file(f.file, 0, upeer_sockaddr, + upeer_addrlen, flags); + if (f.flags) + fput(f.file); + } + + return ret; } SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, From 17f2fe35d080d8f64e86a60cdcd3a97edcbc213b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Oct 2019 14:42:58 -0600 Subject: [PATCH 15/64] io_uring: add support for IORING_OP_ACCEPT This allows an application to call accept4() in an async fashion. Like other opcodes, we first try a non-blocking accept, then punt to async context if we have to. Signed-off-by: Jens Axboe --- fs/io_uring.c | 37 +++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 7 ++++++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e1523567920..b668149c20b9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1686,6 +1686,40 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, #endif } +static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct sockaddr __user *addr; + int __user *addr_len; + unsigned file_flags; + int flags, ret; + + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + return -EINVAL; + + addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); + addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); + flags = READ_ONCE(sqe->accept_flags); + file_flags = force_nonblock ? O_NONBLOCK : 0; + + ret = __sys_accept4_file(req->file, file_flags, addr, addr_len, flags); + if (ret == -EAGAIN && force_nonblock) { + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + return -EAGAIN; + } + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; + io_cqring_add_event(req->ctx, sqe->user_data, ret); + io_put_req(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static void io_poll_remove_one(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; @@ -2173,6 +2207,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_TIMEOUT_REMOVE: ret = io_timeout_remove(req, s->sqe); break; + case IORING_OP_ACCEPT: + ret = io_accept(req, s->sqe, nxt, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6dc5ced1c37a..f82d90e617a6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -19,7 +19,10 @@ struct io_uring_sqe { __u8 flags; /* IOSQE_ flags */ __u16 ioprio; /* ioprio for the request */ __s32 fd; /* file descriptor to do IO on */ - __u64 off; /* offset into file */ + union { + __u64 off; /* offset into file */ + __u64 addr2; + }; __u64 addr; /* pointer to buffer or iovecs */ __u32 len; /* buffer size or number of iovecs */ union { @@ -29,6 +32,7 @@ struct io_uring_sqe { __u32 sync_range_flags; __u32 msg_flags; __u32 timeout_flags; + __u32 accept_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -65,6 +69,7 @@ struct io_uring_sqe { #define IORING_OP_RECVMSG 10 #define IORING_OP_TIMEOUT 11 #define IORING_OP_TIMEOUT_REMOVE 12 +#define IORING_OP_ACCEPT 13 /* * sqe->fsync_flags From b7620121dc04e44ce654297050f9eaf39d414a34 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 26 Oct 2019 07:22:55 -0600 Subject: [PATCH 16/64] io_uring: protect fixed file indexing with array_index_nospec() We index the file tables with a user given value. After we check it's within our limits, use array_index_nospec() to prevent any spectre attacks here. Suggested-by: Jann Horn Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index b668149c20b9..7743b180a3e0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2321,6 +2321,7 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, if (unlikely(!ctx->user_files || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; + fd = array_index_nospec(fd, ctx->nr_user_files); if (!ctx->user_files[fd]) return -EBADF; req->file = ctx->user_files[fd]; From 65e19f54d29cd8559ce60cfd0d751bef7afbdc5c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 26 Oct 2019 07:20:21 -0600 Subject: [PATCH 17/64] io_uring: support for larger fixed file sets There's been a few requests for supporting more fixed files than 1024. This isn't really tricky to do, we just need to split up the file table into multiple tables and index appropriately. As we do so, reduce the max single file table to 512. This enables us to do single page allocs always for the tables, which is an improvement over the situation prior. This patch adds support for up to 64K files, which should be enough for everyone. Signed-off-by: Jens Axboe --- fs/io_uring.c | 150 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 117 insertions(+), 33 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7743b180a3e0..281d0b7597cf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -80,7 +80,14 @@ #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -#define IORING_MAX_FIXED_FILES 1024 + +/* + * Shift of 9 is 512 entries, or exactly one page on 64-bit archs + */ +#define IORING_FILE_TABLE_SHIFT 9 +#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT) +#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1) +#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE) struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -165,6 +172,10 @@ struct io_mapped_ubuf { unsigned int nr_bvecs; }; +struct fixed_file_table { + struct file **files; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -225,7 +236,7 @@ struct io_ring_ctx { * readers must ensure that ->refs is alive as long as the file* is * used. Only updated through io_uring_register(2). */ - struct file **user_files; + struct fixed_file_table *file_table; unsigned nr_user_files; /* if used, fixed mapped user buffers */ @@ -2296,6 +2307,15 @@ static bool io_op_needs_file(const struct io_uring_sqe *sqe) } } +static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, + int index) +{ + struct fixed_file_table *table; + + table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT]; + return table->files[index & IORING_FILE_TABLE_MASK]; +} + static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, struct io_submit_state *state, struct io_kiocb *req) { @@ -2318,13 +2338,13 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, return 0; if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->user_files || + if (unlikely(!ctx->file_table || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); - if (!ctx->user_files[fd]) + req->file = io_file_from_index(ctx, fd); + if (!req->file) return -EBADF; - req->file = ctx->user_files[fd]; req->flags |= REQ_F_FIXED_FILE; } else { if (s->needs_fixed_file) @@ -2969,20 +2989,29 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) #else int i; - for (i = 0; i < ctx->nr_user_files; i++) - if (ctx->user_files[i]) - fput(ctx->user_files[i]); + for (i = 0; i < ctx->nr_user_files; i++) { + struct file *file; + + file = io_file_from_index(ctx, i); + if (file) + fput(file); + } #endif } static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { - if (!ctx->user_files) + unsigned nr_tables, i; + + if (!ctx->file_table) return -ENXIO; __io_sqe_files_unregister(ctx); - kfree(ctx->user_files); - ctx->user_files = NULL; + nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); + for (i = 0; i < nr_tables; i++) + kfree(ctx->file_table[i].files); + kfree(ctx->file_table); + ctx->file_table = NULL; ctx->nr_user_files = 0; return 0; } @@ -3057,9 +3086,11 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) nr_files = 0; fpl->user = get_uid(ctx->user); for (i = 0; i < nr; i++) { - if (!ctx->user_files[i + offset]) + struct file *file = io_file_from_index(ctx, i + offset); + + if (!file) continue; - fpl->fp[nr_files] = get_file(ctx->user_files[i + offset]); + fpl->fp[nr_files] = get_file(file); unix_inflight(fpl->user, fpl->fp[nr_files]); nr_files++; } @@ -3108,8 +3139,10 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) return 0; while (total < ctx->nr_user_files) { - if (ctx->user_files[total]) - fput(ctx->user_files[total]); + struct file *file = io_file_from_index(ctx, total); + + if (file) + fput(file); total++; } @@ -3122,25 +3155,63 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) } #endif +static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, + unsigned nr_files) +{ + int i; + + for (i = 0; i < nr_tables; i++) { + struct fixed_file_table *table = &ctx->file_table[i]; + unsigned this_files; + + this_files = min(nr_files, IORING_MAX_FILES_TABLE); + table->files = kcalloc(this_files, sizeof(struct file *), + GFP_KERNEL); + if (!table->files) + break; + nr_files -= this_files; + } + + if (i == nr_tables) + return 0; + + for (i = 0; i < nr_tables; i++) { + struct fixed_file_table *table = &ctx->file_table[i]; + kfree(table->files); + } + return 1; +} + static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { __s32 __user *fds = (__s32 __user *) arg; + unsigned nr_tables; int fd, ret = 0; unsigned i; - if (ctx->user_files) + if (ctx->file_table) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; - ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL); - if (!ctx->user_files) + nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); + ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table), + GFP_KERNEL); + if (!ctx->file_table) return -ENOMEM; + if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { + kfree(ctx->file_table); + return -ENOMEM; + } + for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { + struct fixed_file_table *table; + unsigned index; + ret = -EFAULT; if (copy_from_user(&fd, &fds[i], sizeof(fd))) break; @@ -3150,10 +3221,12 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, continue; } - ctx->user_files[i] = fget(fd); + table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + index = i & IORING_FILE_TABLE_MASK; + table->files[index] = fget(fd); ret = -EBADF; - if (!ctx->user_files[i]) + if (!table->files[index]) break; /* * Don't allow io_uring instances to be registered. If UNIX @@ -3162,20 +3235,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, * handle it just fine, but there's still no point in allowing * a ring fd as it doesn't support regular read/write anyway. */ - if (ctx->user_files[i]->f_op == &io_uring_fops) { - fput(ctx->user_files[i]); + if (table->files[index]->f_op == &io_uring_fops) { + fput(table->files[index]); break; } ret = 0; } if (ret) { - for (i = 0; i < ctx->nr_user_files; i++) - if (ctx->user_files[i]) - fput(ctx->user_files[i]); + for (i = 0; i < ctx->nr_user_files; i++) { + struct file *file; - kfree(ctx->user_files); - ctx->user_files = NULL; + file = io_file_from_index(ctx, i); + if (file) + fput(file); + } + for (i = 0; i < nr_tables; i++) + kfree(ctx->file_table[i].files); + + kfree(ctx->file_table); + ctx->file_table = NULL; ctx->nr_user_files = 0; return ret; } @@ -3190,7 +3269,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index) { #if defined(CONFIG_UNIX) - struct file *file = ctx->user_files[index]; + struct file *file = io_file_from_index(ctx, index); struct sock *sock = ctx->ring_sock->sk; struct sk_buff_head list, *head = &sock->sk_receive_queue; struct sk_buff *skb; @@ -3246,7 +3325,7 @@ static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index) spin_unlock_irq(&head->lock); } #else - fput(ctx->user_files[index]); + fput(io_file_from_index(ctx, index)); #endif } @@ -3301,7 +3380,7 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, int fd, i, err; __u32 done; - if (!ctx->user_files) + if (!ctx->file_table) return -ENXIO; if (!nr_args) return -EINVAL; @@ -3315,15 +3394,20 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, done = 0; fds = (__s32 __user *) up.fds; while (nr_args) { + struct fixed_file_table *table; + unsigned index; + err = 0; if (copy_from_user(&fd, &fds[done], sizeof(fd))) { err = -EFAULT; break; } i = array_index_nospec(up.offset, ctx->nr_user_files); - if (ctx->user_files[i]) { + table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + index = i & IORING_FILE_TABLE_MASK; + if (table->files[index]) { io_sqe_file_unregister(ctx, i); - ctx->user_files[i] = NULL; + table->files[index] = NULL; } if (fd != -1) { struct file *file; @@ -3346,7 +3430,7 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, err = -EBADF; break; } - ctx->user_files[i] = file; + table->files[index] = file; err = io_sqe_file_register(ctx, file, i); if (err) break; From 842f96124c5617b060cc0f071dcfb6ab24bdd042 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 29 Oct 2019 12:34:10 -0600 Subject: [PATCH 18/64] io_uring: fix race with canceling timeouts If we get -1 from hrtimer_try_to_cancel(), we know that the timer is running. Hence leave all completion to the timeout handler. If we don't, we can corrupt the list and miss a completion. Fixes: 11365043e527 ("io_uring: add support for canceling timeout requests") Reported-by: Hrvoje Zeba Tested-by: Hrvoje Zeba Signed-off-by: Jens Axboe --- fs/io_uring.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 281d0b7597cf..8e25c25c7309 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -533,7 +533,7 @@ static void io_kill_timeout(struct io_kiocb *req) ret = hrtimer_try_to_cancel(&req->timeout.timer); if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); - list_del(&req->list); + list_del_init(&req->list); io_cqring_fill_event(req->ctx, req->user_data, 0); __io_free_req(req); } @@ -1957,7 +1957,6 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx; struct io_kiocb *req; unsigned long flags; - bool comp; req = container_of(timer, struct io_kiocb, timeout.timer); ctx = req->ctx; @@ -1968,8 +1967,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) * We could be racing with timeout deletion. If the list is empty, * then timeout lookup already found it and will be handling it. */ - comp = !list_empty(&req->list); - if (comp) { + if (!list_empty(&req->list)) { struct io_kiocb *prev; /* @@ -1981,17 +1979,15 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) prev = req; list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list) prev->sequence++; - list_del_init(&req->list); - io_cqring_fill_event(ctx, req->user_data, -ETIME); - io_commit_cqring(ctx); } + + io_cqring_fill_event(ctx, req->user_data, -ETIME); + io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); - if (comp) { - io_cqring_ev_posted(ctx); - io_put_req(req, NULL); - } + io_cqring_ev_posted(ctx); + io_put_req(req, NULL); return HRTIMER_NORESTART; } @@ -2131,9 +2127,9 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->sequence -= span; list_add(&req->list, entry); - spin_unlock_irq(&ctx->completion_lock); req->timeout.timer.function = io_timeout_fn; hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode); + spin_unlock_irq(&ctx->completion_lock); return 0; } From 975c99a570967dd48e917dd7853867fee3febabd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Oct 2019 08:42:56 -0600 Subject: [PATCH 19/64] io_uring: io_wq_create() returns an error pointer, not NULL syzbot reported an issue where we crash at setup time if failslab is used. The issue is that io_wq_create() returns an error pointer on failure, not NULL. Hence io_uring thought the io-wq was setup just fine, but in reality it's a garbage error pointer. Use IS_ERR() instead of a NULL check, and assign ret appropriately. Reported-by: syzbot+221cc24572a2fed23b6b@syzkaller.appspotmail.com Fixes: 561fb04a6a22 ("io_uring: replace workqueue usage with io-wq") Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8e25c25c7309..72d260520c8f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3489,8 +3489,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm); - if (!ctx->io_wq) { - ret = -ENOMEM; + if (IS_ERR(ctx->io_wq)) { + ret = PTR_ERR(ctx->io_wq); + ctx->io_wq = NULL; goto err; } From 62755e35dfb2b113c52b81cd96d01c20971c8e02 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Oct 2019 21:49:21 -0600 Subject: [PATCH 20/64] io_uring: support for generic async request cancel This adds support for IORING_OP_ASYNC_CANCEL, which will attempt to cancel requests that have been punted to async context and are now in-flight. This works for regular read/write requests to files, as long as they haven't been started yet. For socket based IO (or things like accept4(2)), we can cancel work that is already running as well. To cancel a request, the sqe must have ->addr set to the user_data of the request it wishes to cancel. If the request is cancelled successfully, the original request is completed with -ECANCELED and the cancel request is completed with a result of 0. If the request was already running, the original may or may not complete in error. The cancel request will complete with -EALREADY for that case. And finally, if the request to cancel wasn't found, the cancel request is completed with -ENOENT. Signed-off-by: Jens Axboe --- fs/io-wq.c | 85 +++++++++++++++++++++++++++++++++++ fs/io-wq.h | 5 +++ fs/io_uring.c | 45 +++++++++++++++++++ include/uapi/linux/io_uring.h | 2 + 4 files changed, 137 insertions(+) diff --git a/fs/io-wq.c b/fs/io-wq.c index 253c04a40db5..652b8bac2dbc 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -639,6 +639,91 @@ void io_wq_cancel_all(struct io_wq *wq) rcu_read_unlock(); } +struct io_cb_cancel_data { + struct io_wqe *wqe; + work_cancel_fn *cancel; + void *caller_data; +}; + +static bool io_work_cancel(struct io_worker *worker, void *cancel_data) +{ + struct io_cb_cancel_data *data = cancel_data; + struct io_wqe *wqe = data->wqe; + bool ret = false; + + /* + * Hold the lock to avoid ->cur_work going out of scope, caller + * may deference the passed in work. + */ + spin_lock_irq(&wqe->lock); + if (worker->cur_work && + data->cancel(worker->cur_work, data->caller_data)) { + send_sig(SIGINT, worker->task, 1); + ret = true; + } + spin_unlock_irq(&wqe->lock); + + return ret; +} + +static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe, + work_cancel_fn *cancel, + void *cancel_data) +{ + struct io_cb_cancel_data data = { + .wqe = wqe, + .cancel = cancel, + .caller_data = cancel_data, + }; + struct io_wq_work *work; + bool found = false; + + spin_lock_irq(&wqe->lock); + list_for_each_entry(work, &wqe->work_list, list) { + if (cancel(work, cancel_data)) { + list_del(&work->list); + found = true; + break; + } + } + spin_unlock_irq(&wqe->lock); + + if (found) { + work->flags |= IO_WQ_WORK_CANCEL; + work->func(&work); + return IO_WQ_CANCEL_OK; + } + + rcu_read_lock(); + found = io_wq_for_each_worker(wqe, &wqe->free_list, io_work_cancel, + &data); + if (found) + goto done; + + found = io_wq_for_each_worker(wqe, &wqe->busy_list, io_work_cancel, + &data); +done: + rcu_read_unlock(); + return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; +} + +enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, + void *data) +{ + enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; + int i; + + for (i = 0; i < wq->nr_wqes; i++) { + struct io_wqe *wqe = wq->wqes[i]; + + ret = io_wqe_cancel_cb_work(wqe, cancel, data); + if (ret != IO_WQ_CANCEL_NOTFOUND) + break; + } + + return ret; +} + static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_wq_work *work = data; diff --git a/fs/io-wq.h b/fs/io-wq.h index e93f764b1fa4..3de192dc73fc 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -43,6 +43,11 @@ void io_wq_flush(struct io_wq *wq); void io_wq_cancel_all(struct io_wq *wq); enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); +typedef bool (work_cancel_fn)(struct io_wq_work *, void *); + +enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, + void *data); + #if defined(CONFIG_IO_WQ) extern void io_wq_worker_sleeping(struct task_struct *); extern void io_wq_worker_running(struct task_struct *); diff --git a/fs/io_uring.c b/fs/io_uring.c index 72d260520c8f..76d653085987 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2133,6 +2133,48 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +static bool io_cancel_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + return req->user_data == (unsigned long) data; +} + +static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt) +{ + struct io_ring_ctx *ctx = req->ctx; + enum io_wq_cancel cancel_ret; + void *sqe_addr; + int ret = 0; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || + sqe->cancel_flags) + return -EINVAL; + + sqe_addr = (void *) (unsigned long) READ_ONCE(sqe->addr); + cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr); + switch (cancel_ret) { + case IO_WQ_CANCEL_OK: + ret = 0; + break; + case IO_WQ_CANCEL_RUNNING: + ret = -EALREADY; + break; + case IO_WQ_CANCEL_NOTFOUND: + ret = -ENOENT; + break; + } + + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; + io_cqring_add_event(req->ctx, sqe->user_data, ret); + io_put_req(req, nxt); + return 0; +} + static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -2217,6 +2259,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_ACCEPT: ret = io_accept(req, s->sqe, nxt, force_nonblock); break; + case IORING_OP_ASYNC_CANCEL: + ret = io_async_cancel(req, s->sqe, nxt); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f82d90e617a6..6877cf8894db 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -33,6 +33,7 @@ struct io_uring_sqe { __u32 msg_flags; __u32 timeout_flags; __u32 accept_flags; + __u32 cancel_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -70,6 +71,7 @@ struct io_uring_sqe { #define IORING_OP_TIMEOUT 11 #define IORING_OP_TIMEOUT_REMOVE 12 #define IORING_OP_ACCEPT 13 +#define IORING_OP_ASYNC_CANCEL 14 /* * sqe->fsync_flags From e9ffa5c2b77edf2689f876b640318b16fc3ea2a7 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Tue, 29 Oct 2019 11:16:42 +0800 Subject: [PATCH 21/64] io_uring: set -EINTR directly when a signal wakes up in io_cqring_wait We didn't use -ERESTARTSYS to tell the application layer to restart the system call, but instead return -EINTR. we can set -EINTR directly when wakeup by the signal, which can help us save an assignment operation and comparison operation. Reviewed-by: Bob Liu Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 76d653085987..a520c4262d85 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2976,7 +2976,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, .to_wait = min_events, }; struct io_rings *rings = ctx->rings; - int ret; + int ret = 0; if (io_cqring_events(rings) >= min_events) return 0; @@ -2994,7 +2994,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - ret = 0; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); trace_io_uring_cqring_wait(ctx, min_events); do { @@ -3004,15 +3003,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, break; schedule(); if (signal_pending(current)) { - ret = -ERESTARTSYS; + ret = -EINTR; break; } } while (1); finish_wait(&ctx->wait, &iowq.wq); - restore_saved_sigmask_unless(ret == -ERESTARTSYS); - if (ret == -ERESTARTSYS) - ret = -EINTR; + restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } From 0069fc6b1cf28de3a3890ed7c87a5b8ab79ca528 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 1 Nov 2019 12:44:40 -0600 Subject: [PATCH 22/64] io_uring: remove io_uring_add_to_prev() trace event This internal logic was killed with the conversion to io-wq, so we no longer have a need for this particular trace. Kill it. Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index b85255121b98..8f21d8bf20fd 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -227,35 +227,6 @@ TRACE_EVENT(io_uring_link, __entry->ctx, __entry->req, __entry->target_req) ); -/** - * io_uring_add_to_prev - called after a request was added into a previously - * submitted work - * - * @req: pointer to a request, added to a previous - * @ret: whether or not it was completed successfully - * - * Allows to track merged work, to figure out how often requests are piggy - * backed into other ones, changing the execution flow. - */ -TRACE_EVENT(io_uring_add_to_prev, - - TP_PROTO(void *req, bool ret), - - TP_ARGS(req, ret), - - TP_STRUCT__entry ( - __field( void *, req ) - __field( bool, ret ) - ), - - TP_fast_assign( - __entry->req = req; - __entry->ret = ret; - ), - - TP_printk("request %p, ret %d", __entry->req, __entry->ret) -); - /** * io_uring_cqring_wait - called before start waiting for an available CQE * From 364b05fd06e87e53dc03396f73afeac48d8e0998 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 2 Nov 2019 15:55:01 +0800 Subject: [PATCH 23/64] io-wq: use kfree_rcu() to simplify the code The callback function of call_rcu() just calls kfree(), so we can use kfree_rcu() instead of call_rcu() + callback function. Signed-off-by: YueHaibing Signed-off-by: Jens Axboe --- fs/io-wq.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 652b8bac2dbc..3bbab2c58695 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -102,13 +102,6 @@ struct io_wq { struct completion done; }; -static void io_wq_free_worker(struct rcu_head *head) -{ - struct io_worker *worker = container_of(head, struct io_worker, rcu); - - kfree(worker); -} - static bool io_worker_get(struct io_worker *worker) { return refcount_inc_not_zero(&worker->ref); @@ -194,7 +187,7 @@ static void io_worker_exit(struct io_worker *worker) if (all_done && refcount_dec_and_test(&wqe->wq->refs)) complete(&wqe->wq->done); - call_rcu(&worker->rcu, io_wq_free_worker); + kfree_rcu(worker, rcu); } static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) From 51c3ff62cac635ae9d75f875ce5b7bdafc97abd5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 3 Nov 2019 06:52:50 -0700 Subject: [PATCH 24/64] io_uring: add completion trace event We currently don't have a completion event trace, add one of those. And to better be able to match up submissions and completions, add user_data to the submission trace as well. Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++-- include/trace/events/io_uring.h | 54 +++++++++++++++++++++++++++------ 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a520c4262d85..7813bc7d5b61 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -592,6 +592,8 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, { struct io_uring_cqe *cqe; + trace_io_uring_complete(ctx, ki_user_data, res); + /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in @@ -2733,7 +2735,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, s.has_user = *mm != NULL; s.in_async = true; s.needs_fixed_file = true; - trace_io_uring_submit_sqe(ctx, true, true); + trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, true); io_submit_sqe(ctx, &s, statep, &link); submitted++; } @@ -2913,7 +2915,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, s.needs_fixed_file = false; s.ring_fd = ring_fd; submit++; - trace_io_uring_submit_sqe(ctx, true, false); + trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, false); io_submit_sqe(ctx, &s, statep, &link); } diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 8f21d8bf20fd..72a4d0174b02 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -284,10 +284,43 @@ TRACE_EVENT(io_uring_fail_link, TP_printk("request %p, link %p", __entry->req, __entry->link) ); +/** + * io_uring_complete - called when completing an SQE + * + * @ctx: pointer to a ring context structure + * @user_data: user data associated with the request + * @res: result of the request + * + */ +TRACE_EVENT(io_uring_complete, + + TP_PROTO(void *ctx, u64 user_data, long res), + + TP_ARGS(ctx, user_data, res), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u64, user_data ) + __field( long, res ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->user_data = user_data; + __entry->res = res; + ), + + TP_printk("ring %p, user_data 0x%llx, result %ld", + __entry->ctx, (unsigned long long)__entry->user_data, + __entry->res) +); + + /** * io_uring_submit_sqe - called before submitting one SQE * - * @ctx: pointer to a ring context structure + * @ctx: pointer to a ring context structure + * @user_data: user data associated with the request * @force_nonblock: whether a context blocking or not * @sq_thread: true if sq_thread has submitted this SQE * @@ -296,24 +329,27 @@ TRACE_EVENT(io_uring_fail_link, */ TRACE_EVENT(io_uring_submit_sqe, - TP_PROTO(void *ctx, bool force_nonblock, bool sq_thread), + TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread), - TP_ARGS(ctx, force_nonblock, sq_thread), + TP_ARGS(ctx, user_data, force_nonblock, sq_thread), TP_STRUCT__entry ( - __field( void *, ctx ) + __field( void *, ctx ) + __field( u64, user_data ) __field( bool, force_nonblock ) - __field( bool, sq_thread ) + __field( bool, sq_thread ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = ctx; + __entry->user_data = user_data; __entry->force_nonblock = force_nonblock; - __entry->sq_thread = sq_thread; + __entry->sq_thread = sq_thread; ), - TP_printk("ring %p, non block %d, sq_thread %d", - __entry->ctx, __entry->force_nonblock, __entry->sq_thread) + TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d", + __entry->ctx, (unsigned long long) __entry->user_data, + __entry->force_nonblock, __entry->sq_thread) ); #endif /* _TRACE_IO_URING_H */ From 1056ef940380c4e32349ccb6d956858edf70520c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 4 Nov 2019 08:50:02 -0700 Subject: [PATCH 25/64] MAINTAINERS: update io_uring entry We now have a list that's appropriate for both kernel and userspace discussions on io_uring usage and development, add that to the MAINTAINERS entry. Also add the io-wq files. Signed-off-by: Jens Axboe --- MAINTAINERS | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index c6c34d04ce95..7afb25707098 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8562,12 +8562,13 @@ F: include/linux/iova.h IO_URING M: Jens Axboe -L: linux-block@vger.kernel.org -L: linux-fsdevel@vger.kernel.org +L: io-uring@vger.kernel.org T: git git://git.kernel.dk/linux-block T: git git://git.kernel.dk/liburing S: Maintained F: fs/io_uring.c +F: fs/io-wq.c +F: fs/io-wq.h F: include/uapi/linux/io_uring.h IPMI SUBSYSTEM From 6f72653e76a511db47addad6ab690390233fc250 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Nov 2019 13:51:51 -0700 Subject: [PATCH 26/64] io-wq: use proper nesting IRQ disabling spinlocks for cancel We don't know what context we'll be called in for cancel, it could very well be with IRQs disabled already. Use the IRQ saving variants of the locking primitives. Signed-off-by: Jens Axboe --- fs/io-wq.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 3bbab2c58695..ba40a7ee31c3 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -642,19 +642,20 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data) { struct io_cb_cancel_data *data = cancel_data; struct io_wqe *wqe = data->wqe; + unsigned long flags; bool ret = false; /* * Hold the lock to avoid ->cur_work going out of scope, caller * may deference the passed in work. */ - spin_lock_irq(&wqe->lock); + spin_lock_irqsave(&wqe->lock, flags); if (worker->cur_work && data->cancel(worker->cur_work, data->caller_data)) { send_sig(SIGINT, worker->task, 1); ret = true; } - spin_unlock_irq(&wqe->lock); + spin_unlock_irqrestore(&wqe->lock, flags); return ret; } @@ -669,9 +670,10 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe, .caller_data = cancel_data, }; struct io_wq_work *work; + unsigned long flags; bool found = false; - spin_lock_irq(&wqe->lock); + spin_lock_irqsave(&wqe->lock, flags); list_for_each_entry(work, &wqe->work_list, list) { if (cancel(work, cancel_data)) { list_del(&work->list); @@ -679,7 +681,7 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe, break; } } - spin_unlock_irq(&wqe->lock); + spin_unlock_irqrestore(&wqe->lock, flags); if (found) { work->flags |= IO_WQ_WORK_CANCEL; @@ -733,6 +735,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, struct io_wq_work *cwork) { struct io_wq_work *work; + unsigned long flags; bool found = false; cwork->flags |= IO_WQ_WORK_CANCEL; @@ -742,7 +745,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, * from there. CANCEL_OK means that the work is returned as-new, * no completion will be posted for it. */ - spin_lock_irq(&wqe->lock); + spin_lock_irqsave(&wqe->lock, flags); list_for_each_entry(work, &wqe->work_list, list) { if (work == cwork) { list_del(&work->list); @@ -750,7 +753,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, break; } } - spin_unlock_irq(&wqe->lock); + spin_unlock_irqrestore(&wqe->lock, flags); if (found) { work->flags |= IO_WQ_WORK_CANCEL; From 89723d0bd6c77540c01ce7db2cd6f8c3be2fd958 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Nov 2019 15:32:58 -0700 Subject: [PATCH 27/64] io_uring: enable optimized link handling for IORING_OP_POLL_ADD As introduced by commit: ba816ad61fdf ("io_uring: run dependent links inline if possible") enable inline dependent link running for poll commands. io_poll_complete_work() is the most important change, as it allows a linked sequence of { POLL, READ } (for example) to proceed inline instead of needing to get punted to another async context. The submission side only potentially matters for sqthread, but may as well include that bit. Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7813bc7d5b61..bda27b52fd5b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1806,6 +1806,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) struct io_poll_iocb *poll = &req->poll; struct poll_table_struct pt = { ._key = poll->events }; struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *nxt = NULL; __poll_t mask = 0; if (work->flags & IO_WQ_WORK_CANCEL) @@ -1832,7 +1833,10 @@ static void io_poll_complete_work(struct io_wq_work **workptr) spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_put_req(req, NULL); + + io_put_req(req, &nxt); + if (nxt) + *workptr = &nxt->work; } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -1886,7 +1890,8 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, add_wait_queue(head, &pt->req->poll.wait); } -static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt) { struct io_poll_iocb *poll = &req->poll; struct io_ring_ctx *ctx = req->ctx; @@ -1949,7 +1954,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (mask) { io_cqring_ev_posted(ctx); - io_put_req(req, NULL); + io_put_req(req, nxt); } return ipt.error; } @@ -2238,7 +2243,7 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_fsync(req, s->sqe, nxt, force_nonblock); break; case IORING_OP_POLL_ADD: - ret = io_poll_add(req, s->sqe); + ret = io_poll_add(req, s->sqe, nxt); break; case IORING_OP_POLL_REMOVE: ret = io_poll_remove(req, s->sqe); From f1f40853c01b5ccd0a1a29ce0b515c6f5405a798 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Nov 2019 20:33:16 -0700 Subject: [PATCH 28/64] io_uring: fixup a few spots where link failure isn't flagged If a request fails, we need to ensure we set REQ_F_FAIL_LINK on it if REQ_F_LINK is set. Any failure in the chain should break the chain. We were missing a few spots where this should be done. It might be nice to generalize this somewhat at some point, as long as we factor in the fact that failure looks different for each request type. Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index bda27b52fd5b..4edc94aab17e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1672,6 +1672,8 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, } io_cqring_add_event(req->ctx, sqe->user_data, ret); + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; io_put_req(req, nxt); return 0; } @@ -1787,6 +1789,8 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) spin_unlock_irq(&ctx->completion_lock); io_cqring_add_event(req->ctx, sqe->user_data, ret); + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; io_put_req(req, NULL); return 0; } @@ -1994,6 +1998,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_put_req(req, NULL); return HRTIMER_NORESTART; } @@ -2035,6 +2041,8 @@ static int io_timeout_remove(struct io_kiocb *req, io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_put_req(req, NULL); return 0; } @@ -2328,6 +2336,8 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_put_req(req, NULL); if (ret) { + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(ctx, sqe->user_data, ret); io_put_req(req, NULL); } From 3aa5fa030558e2b0da284fd069aeb7178543c987 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Nov 2019 20:34:32 -0700 Subject: [PATCH 29/64] io_uring: kill dead REQ_F_LINK_DONE flag We had no more use for this flag after the conversion to io-wq, kill it off. Fixes: 561fb04a6a22 ("io_uring: replace workqueue usage with io-wq") Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4edc94aab17e..cfdb51dd669e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -329,7 +329,6 @@ struct io_kiocb { #define REQ_F_IO_DRAIN 16 /* drain existing IO first */ #define REQ_F_IO_DRAINED 32 /* drain done */ #define REQ_F_LINK 64 /* linked sqes */ -#define REQ_F_LINK_DONE 128 /* linked sqes done */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ #define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ #define REQ_F_TIMEOUT 1024 /* timeout request */ @@ -731,7 +730,6 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) nxt->flags |= REQ_F_LINK; } - nxt->flags |= REQ_F_LINK_DONE; /* * If we're in async work, we can continue processing the chain * in this context instead of having to queue up new async work. From ae9428ca61271b6b7f52ebbc359676c9fdfde523 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 6 Nov 2019 00:22:14 +0300 Subject: [PATCH 30/64] io_uring: Merge io_submit_sqes and io_ring_submit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit io_submit_sqes() and io_ring_submit() are doing the same stuff with a little difference. Deduplicate them. Reviewed-by:Bob Liu Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 91 ++++++++++----------------------------------------- 1 file changed, 18 insertions(+), 73 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cfdb51dd669e..375c09a43d32 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2694,7 +2694,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, - struct mm_struct **mm) + struct file *ring_file, int ring_fd, + struct mm_struct **mm, bool async) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; @@ -2745,10 +2746,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } out: + s.ring_file = ring_file; + s.ring_fd = ring_fd; s.has_user = *mm != NULL; - s.in_async = true; - s.needs_fixed_file = true; - trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, true); + s.in_async = async; + s.needs_fixed_file = async; + trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, async); io_submit_sqe(ctx, &s, statep, &link); submitted++; } @@ -2758,6 +2761,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (statep) io_submit_state_end(&state); + /* Commit SQ ring head once we've consumed and submitted all SQEs */ + io_commit_sqring(ctx); + return submitted; } @@ -2862,10 +2868,8 @@ static int io_sq_thread(void *data) } to_submit = min(to_submit, ctx->sq_entries); - inflight += io_submit_sqes(ctx, to_submit, &cur_mm); - - /* Commit SQ ring head once we've consumed all SQEs */ - io_commit_sqring(ctx); + inflight += io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, + true); } set_fs(old_fs); @@ -2879,69 +2883,6 @@ static int io_sq_thread(void *data) return 0; } -static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, - struct file *ring_file, int ring_fd) -{ - struct io_submit_state state, *statep = NULL; - struct io_kiocb *link = NULL; - struct io_kiocb *shadow_req = NULL; - bool prev_was_link = false; - int i, submit = 0; - - if (to_submit > IO_PLUG_THRESHOLD) { - io_submit_state_start(&state, ctx, to_submit); - statep = &state; - } - - for (i = 0; i < to_submit; i++) { - struct sqe_submit s; - - if (!io_get_sqring(ctx, &s)) - break; - - /* - * If previous wasn't linked and we have a linked command, - * that's the end of the chain. Submit the previous link. - */ - if (!prev_was_link && link) { - io_queue_link_head(ctx, link, &link->submit, shadow_req); - link = NULL; - shadow_req = NULL; - } - prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; - - if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { - if (!shadow_req) { - shadow_req = io_get_req(ctx, NULL); - if (unlikely(!shadow_req)) - goto out; - shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); - refcount_dec(&shadow_req->refs); - } - shadow_req->sequence = s.sequence; - } - -out: - s.ring_file = ring_file; - s.has_user = true; - s.in_async = false; - s.needs_fixed_file = false; - s.ring_fd = ring_fd; - submit++; - trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, false); - io_submit_sqe(ctx, &s, statep, &link); - } - - if (link) - io_queue_link_head(ctx, link, &link->submit, shadow_req); - if (statep) - io_submit_state_end(statep); - - io_commit_sqring(ctx); - - return submit; -} - struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; @@ -4062,10 +4003,14 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, wake_up(&ctx->sqo_wait); submitted = to_submit; } else if (to_submit) { - to_submit = min(to_submit, ctx->sq_entries); + struct mm_struct *cur_mm; + to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); - submitted = io_ring_submit(ctx, to_submit, f.file, fd); + /* already have mm, so io_submit_sqes() won't try to grab it */ + cur_mm = ctx->sqo_mm; + submitted = io_submit_sqes(ctx, to_submit, f.file, fd, + &cur_mm, false); mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { From e5eb6366ac2d1df8ad5b010718ac1997ceae45be Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 6 Nov 2019 00:22:15 +0300 Subject: [PATCH 31/64] io_uring: io_queue_link*() right after submit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After a call to io_submit_sqe(), it's already known whether it needs to queue a link or not. Do it there, as it's simplier and doesn't keep an extra variable across the loop. Reviewed-by:Bob Liu Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 375c09a43d32..6524898831e0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2700,7 +2700,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; struct io_kiocb *shadow_req = NULL; - bool prev_was_link = false; int i, submitted = 0; bool mm_fault = false; @@ -2723,17 +2722,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } - /* - * If previous wasn't linked and we have a linked command, - * that's the end of the chain. Submit the previous link. - */ - if (!prev_was_link && link) { - io_queue_link_head(ctx, link, &link->submit, shadow_req); - link = NULL; - shadow_req = NULL; - } - prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; - if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { if (!shadow_req) { shadow_req = io_get_req(ctx, NULL); @@ -2754,6 +2742,16 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, async); io_submit_sqe(ctx, &s, statep, &link); submitted++; + + /* + * If previous wasn't linked and we have a linked command, + * that's the end of the chain. Submit the previous link. + */ + if (!(s.sqe->flags & IOSQE_IO_LINK) && link) { + io_queue_link_head(ctx, link, &link->submit, shadow_req); + link = NULL; + shadow_req = NULL; + } } if (link) From 196be95cd5572078be9deb81cbea145fab246029 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Nov 2019 01:41:06 +0300 Subject: [PATCH 32/64] io_uring: allocate io_kiocb upfront Let io_submit_sqes() to allocate io_kiocb before fetching an sqe. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6524898831e0..d10216dd02bc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2551,30 +2551,23 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) -static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, - struct io_submit_state *state, struct io_kiocb **link) +static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + struct sqe_submit *s, struct io_submit_state *state, + struct io_kiocb **link) { struct io_uring_sqe *sqe_copy; - struct io_kiocb *req; int ret; /* enforce forwards compatibility on users */ if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; - goto err; - } - - req = io_get_req(ctx, state); - if (unlikely(!req)) { - ret = -EAGAIN; - goto err; + goto err_req; } ret = io_req_set_file(ctx, s, state, req); if (unlikely(ret)) { err_req: io_free_req(req, NULL); -err: io_cqring_add_event(ctx, s->sqe->user_data, ret); return; } @@ -2710,9 +2703,18 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, for (i = 0; i < nr; i++) { struct sqe_submit s; + struct io_kiocb *req; - if (!io_get_sqring(ctx, &s)) + req = io_get_req(ctx, statep); + if (unlikely(!req)) { + if (!submitted) + submitted = -EAGAIN; break; + } + if (!io_get_sqring(ctx, &s)) { + __io_free_req(req); + break; + } if (io_sqe_needs_user(s.sqe) && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); @@ -2740,7 +2742,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, s.in_async = async; s.needs_fixed_file = async; trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, async); - io_submit_sqe(ctx, &s, statep, &link); + io_submit_sqe(ctx, req, &s, statep, &link); submitted++; /* From 50585b9a07367b92382c1e975265344daeba78cd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Nov 2019 01:41:07 +0300 Subject: [PATCH 33/64] io_uring: Use submit info inlined into req Stack allocated struct sqe_submit is passed down to the submission path along with a request (a.k.a. struct io_kiocb), and will be copied into req->submit for async requests. As space for it is already allocated, fill req->submit in the first place instead of using on-stack one. As a result: 1. sqe->submit is the only place for sqe_submit and is always valid, so we don't need to track which one to use. 2. don't need to copy in case of async 3. allows to simplify the code by not carrying it as an argument all the way down 4. allows to reduce number of function arguments / potentially improve spilling The downside is that stack is most probably be cached, that's not true for just allocated memory for a request. Another concern is cache pollution. Though, a request would be touched and fetched along with req->submit at some point anyway, so shouldn't be a problem. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d10216dd02bc..2b48a79848f2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2456,7 +2456,6 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); if (sqe_copy) { s->sqe = sqe_copy; - memcpy(&req->submit, s, sizeof(*s)); if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { ret = io_grab_files(ctx, req); if (ret) { @@ -2591,13 +2590,11 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, } s->sqe = sqe_copy; - memcpy(&req->submit, s, sizeof(*s)); trace_io_uring_link(ctx, req, prev); list_add_tail(&req->list, &prev->link_list); } else if (s->sqe->flags & IOSQE_IO_LINK) { req->flags |= REQ_F_LINK; - memcpy(&req->submit, s, sizeof(*s)); INIT_LIST_HEAD(&req->link_list); *link = req; } else { @@ -2702,8 +2699,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } for (i = 0; i < nr; i++) { - struct sqe_submit s; struct io_kiocb *req; + unsigned int sqe_flags; req = io_get_req(ctx, statep); if (unlikely(!req)) { @@ -2711,12 +2708,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, submitted = -EAGAIN; break; } - if (!io_get_sqring(ctx, &s)) { + if (!io_get_sqring(ctx, &req->submit)) { __io_free_req(req); break; } - if (io_sqe_needs_user(s.sqe) && !*mm) { + if (io_sqe_needs_user(req->submit.sqe) && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -2724,7 +2721,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } - if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { + sqe_flags = req->submit.sqe->flags; + + if (link && (sqe_flags & IOSQE_IO_DRAIN)) { if (!shadow_req) { shadow_req = io_get_req(ctx, NULL); if (unlikely(!shadow_req)) @@ -2732,24 +2731,25 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); refcount_dec(&shadow_req->refs); } - shadow_req->sequence = s.sequence; + shadow_req->sequence = req->submit.sequence; } out: - s.ring_file = ring_file; - s.ring_fd = ring_fd; - s.has_user = *mm != NULL; - s.in_async = async; - s.needs_fixed_file = async; - trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, async); - io_submit_sqe(ctx, req, &s, statep, &link); + req->submit.ring_file = ring_file; + req->submit.ring_fd = ring_fd; + req->submit.has_user = *mm != NULL; + req->submit.in_async = async; + req->submit.needs_fixed_file = async; + trace_io_uring_submit_sqe(ctx, req->submit.sqe->user_data, + true, async); + io_submit_sqe(ctx, req, &req->submit, statep, &link); submitted++; /* * If previous wasn't linked and we have a linked command, * that's the end of the chain. Submit the previous link. */ - if (!(s.sqe->flags & IOSQE_IO_LINK) && link) { + if (!(sqe_flags & IOSQE_IO_LINK) && link) { io_queue_link_head(ctx, link, &link->submit, shadow_req); link = NULL; shadow_req = NULL; From 267bc90442aa47002e2991f7d9dd141e168b466b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Nov 2019 01:41:08 +0300 Subject: [PATCH 34/64] io_uring: use inlined struct sqe_submit req->submit is always up-to-date, use it directly Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 87 +++++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2b48a79848f2..ee33c7020828 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1155,10 +1155,9 @@ static bool io_file_supports_async(struct file *file) return false; } -static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock) +static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) { - const struct io_uring_sqe *sqe = s->sqe; + const struct io_uring_sqe *sqe = req->submit.sqe; struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; unsigned ioprio; @@ -1406,8 +1405,8 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; } -static int io_read(struct io_kiocb *req, const struct sqe_submit *s, - struct io_kiocb **nxt, bool force_nonblock) +static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; @@ -1416,7 +1415,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, size_t iov_count; ssize_t read_size, ret; - ret = io_prep_rw(req, s, force_nonblock); + ret = io_prep_rw(req, force_nonblock); if (ret) return ret; file = kiocb->ki_filp; @@ -1424,7 +1423,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; - ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); + ret = io_import_iovec(req->ctx, READ, &req->submit, &iovec, &iter); if (ret < 0) return ret; @@ -1456,7 +1455,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) - kiocb_done(kiocb, ret2, nxt, s->in_async); + kiocb_done(kiocb, ret2, nxt, req->submit.in_async); else ret = -EAGAIN; } @@ -1464,8 +1463,8 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, return ret; } -static int io_write(struct io_kiocb *req, const struct sqe_submit *s, - struct io_kiocb **nxt, bool force_nonblock) +static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; @@ -1474,7 +1473,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, size_t iov_count; ssize_t ret; - ret = io_prep_rw(req, s, force_nonblock); + ret = io_prep_rw(req, force_nonblock); if (ret) return ret; @@ -1482,7 +1481,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, if (unlikely(!(file->f_mode & FMODE_WRITE))) return -EBADF; - ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); + ret = io_import_iovec(req->ctx, WRITE, &req->submit, &iovec, &iter); if (ret < 0) return ret; @@ -1519,7 +1518,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, else ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) - kiocb_done(kiocb, ret2, nxt, s->in_async); + kiocb_done(kiocb, ret2, nxt, req->submit.in_async); else ret = -EAGAIN; } @@ -2188,9 +2187,9 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req) { + const struct io_uring_sqe *sqe = req->submit.sqe; struct io_uring_sqe *sqe_copy; if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) @@ -2217,10 +2216,10 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, } static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct sqe_submit *s, struct io_kiocb **nxt, - bool force_nonblock) + struct io_kiocb **nxt, bool force_nonblock) { int ret, opcode; + struct sqe_submit *s = &req->submit; req->user_data = READ_ONCE(s->sqe->user_data); @@ -2232,18 +2231,18 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_READV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_read(req, s, nxt, force_nonblock); + ret = io_read(req, nxt, force_nonblock); break; case IORING_OP_WRITEV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_write(req, s, nxt, force_nonblock); + ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_READ_FIXED: - ret = io_read(req, s, nxt, force_nonblock); + ret = io_read(req, nxt, force_nonblock); break; case IORING_OP_WRITE_FIXED: - ret = io_write(req, s, nxt, force_nonblock); + ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_FSYNC: ret = io_fsync(req, s->sqe, nxt, force_nonblock); @@ -2318,7 +2317,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; s->in_async = true; do { - ret = __io_submit_sqe(ctx, req, s, &nxt, false); + ret = __io_submit_sqe(ctx, req, &nxt, false); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -2372,9 +2371,10 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, return table->files[index & IORING_FILE_TABLE_MASK]; } -static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, +static int io_req_set_file(struct io_ring_ctx *ctx, struct io_submit_state *state, struct io_kiocb *req) { + struct sqe_submit *s = &req->submit; unsigned flags; int fd; @@ -2438,12 +2438,11 @@ static int io_grab_files(struct io_ring_ctx *ctx, struct io_kiocb *req) return ret; } -static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s) +static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req) { int ret; - ret = __io_submit_sqe(ctx, req, s, NULL, true); + ret = __io_submit_sqe(ctx, req, NULL, true); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -2451,6 +2450,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { + struct sqe_submit *s = &req->submit; struct io_uring_sqe *sqe_copy; sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); @@ -2488,31 +2488,30 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return ret; } -static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s) +static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req) { int ret; - ret = io_req_defer(ctx, req, s->sqe); + ret = io_req_defer(ctx, req); if (ret) { if (ret != -EIOCBQUEUED) { + io_cqring_add_event(ctx, req->submit.sqe->user_data, ret); io_free_req(req, NULL); - io_cqring_add_event(ctx, s->sqe->user_data, ret); } return 0; } - return __io_queue_sqe(ctx, req, s); + return __io_queue_sqe(ctx, req); } static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s, struct io_kiocb *shadow) + struct io_kiocb *shadow) { int ret; int need_submit = false; if (!shadow) - return io_queue_sqe(ctx, req, s); + return io_queue_sqe(ctx, req); /* * Mark the first IO in link list as DRAIN, let all the following @@ -2520,12 +2519,12 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, * list. */ req->flags |= REQ_F_IO_DRAIN; - ret = io_req_defer(ctx, req, s->sqe); + ret = io_req_defer(ctx, req); if (ret) { if (ret != -EIOCBQUEUED) { + io_cqring_add_event(ctx, req->submit.sqe->user_data, ret); io_free_req(req, NULL); __io_free_req(shadow); - io_cqring_add_event(ctx, s->sqe->user_data, ret); return 0; } } else { @@ -2543,7 +2542,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, spin_unlock_irq(&ctx->completion_lock); if (need_submit) - return __io_queue_sqe(ctx, req, s); + return __io_queue_sqe(ctx, req); return 0; } @@ -2551,10 +2550,10 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s, struct io_submit_state *state, - struct io_kiocb **link) + struct io_submit_state *state, struct io_kiocb **link) { struct io_uring_sqe *sqe_copy; + struct sqe_submit *s = &req->submit; int ret; /* enforce forwards compatibility on users */ @@ -2563,11 +2562,11 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, goto err_req; } - ret = io_req_set_file(ctx, s, state, req); + ret = io_req_set_file(ctx, state, req); if (unlikely(ret)) { err_req: - io_free_req(req, NULL); io_cqring_add_event(ctx, s->sqe->user_data, ret); + io_free_req(req, NULL); return; } @@ -2598,7 +2597,7 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, INIT_LIST_HEAD(&req->link_list); *link = req; } else { - io_queue_sqe(ctx, req, s); + io_queue_sqe(ctx, req); } } @@ -2742,7 +2741,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, req->submit.needs_fixed_file = async; trace_io_uring_submit_sqe(ctx, req->submit.sqe->user_data, true, async); - io_submit_sqe(ctx, req, &req->submit, statep, &link); + io_submit_sqe(ctx, req, statep, &link); submitted++; /* @@ -2750,14 +2749,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, * that's the end of the chain. Submit the previous link. */ if (!(sqe_flags & IOSQE_IO_LINK) && link) { - io_queue_link_head(ctx, link, &link->submit, shadow_req); + io_queue_link_head(ctx, link, shadow_req); link = NULL; shadow_req = NULL; } } if (link) - io_queue_link_head(ctx, link, &link->submit, shadow_req); + io_queue_link_head(ctx, link, shadow_req); if (statep) io_submit_state_end(&state); From e977d6d34f0c08e3c3b132c9e73b98d0db50abc1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Nov 2019 12:39:45 -0700 Subject: [PATCH 35/64] io_uring: abstract out io_async_cancel_one() helper We're going to need this helper in a future patch, so move it out of io_async_cancel() and into its own separate function. No functional changes in this patch. Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ee33c7020828..5360c3dd262b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2152,21 +2152,11 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data) return req->user_data == (unsigned long) data; } -static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt) +static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) { - struct io_ring_ctx *ctx = req->ctx; enum io_wq_cancel cancel_ret; - void *sqe_addr; int ret = 0; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || - sqe->cancel_flags) - return -EINVAL; - - sqe_addr = (void *) (unsigned long) READ_ONCE(sqe->addr); cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr); switch (cancel_ret) { case IO_WQ_CANCEL_OK: @@ -2180,6 +2170,25 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } + return ret; +} + +static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt) +{ + struct io_ring_ctx *ctx = req->ctx; + void *sqe_addr; + int ret; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || + sqe->cancel_flags) + return -EINVAL; + + sqe_addr = (void *) (unsigned long) READ_ONCE(sqe->addr); + ret = io_async_cancel_one(ctx, sqe_addr); + if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req->ctx, sqe->user_data, ret); From 2665abfd757fb35a241c6f0b1ebf620e3ffb36fb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Nov 2019 12:40:47 -0700 Subject: [PATCH 36/64] io_uring: add support for linked SQE timeouts While we have support for generic timeouts, we don't have a way to tie a timeout to a specific SQE. The generic timeouts simply trigger wakeups on the CQ ring. This adds support for IORING_OP_LINK_TIMEOUT. This command is only valid as a link to a previous command. The timeout specific can be either relative or absolute, following the same rules as IORING_OP_TIMEOUT. If the timeout triggers before the dependent command completes, it will attempt to cancel that command. Likewise, if the dependent command completes before the timeout triggers, it will cancel the timeout. Signed-off-by: Jens Axboe --- fs/io_uring.c | 191 ++++++++++++++++++++++++++++++++-- include/uapi/linux/io_uring.h | 1 + 2 files changed, 181 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5360c3dd262b..eadd19ab39a8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -329,6 +329,7 @@ struct io_kiocb { #define REQ_F_IO_DRAIN 16 /* drain existing IO first */ #define REQ_F_IO_DRAINED 32 /* drain done */ #define REQ_F_LINK 64 /* linked sqes */ +#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ #define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ #define REQ_F_TIMEOUT 1024 /* timeout request */ @@ -371,6 +372,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, long res); static void __io_free_req(struct io_kiocb *req); +static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr); static struct kmem_cache *req_cachep; @@ -712,9 +714,28 @@ static void __io_free_req(struct io_kiocb *req) kmem_cache_free(req_cachep, req); } +static bool io_link_cancel_timeout(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + int ret; + + ret = hrtimer_try_to_cancel(&req->timeout.timer); + if (ret != -1) { + io_cqring_fill_event(ctx, req->user_data, -ECANCELED); + io_commit_cqring(ctx); + req->flags &= ~REQ_F_LINK; + __io_free_req(req); + return true; + } + + return false; +} + static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { + struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt; + bool wake_ev = false; /* * The list should never be empty when we are called here. But could @@ -722,7 +743,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * safe side. */ nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - if (nxt) { + while (nxt) { list_del(&nxt->list); if (!list_empty(&req->link_list)) { INIT_LIST_HEAD(&nxt->link_list); @@ -734,11 +755,23 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * If we're in async work, we can continue processing the chain * in this context instead of having to queue up new async work. */ - if (nxtptr && current_work()) + if (req->flags & REQ_F_LINK_TIMEOUT) { + wake_ev = io_link_cancel_timeout(ctx, nxt); + + /* we dropped this link, get next */ + nxt = list_first_entry_or_null(&req->link_list, + struct io_kiocb, list); + } else if (nxtptr && current_work()) { *nxtptr = nxt; - else + break; + } else { io_queue_async_work(req->ctx, nxt); + break; + } } + + if (wake_ev) + io_cqring_ev_posted(ctx); } /* @@ -746,31 +779,61 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) */ static void io_fail_links(struct io_kiocb *req) { + struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link; + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { link = list_first_entry(&req->link_list, struct io_kiocb, list); - list_del(&link->list); + list_del_init(&link->list); trace_io_uring_fail_link(req, link); - io_cqring_add_event(req->ctx, link->user_data, -ECANCELED); - __io_free_req(link); + + if ((req->flags & REQ_F_LINK_TIMEOUT) && + link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) { + io_link_cancel_timeout(ctx, link); + } else { + io_cqring_fill_event(ctx, link->user_data, -ECANCELED); + __io_free_req(link); + } } + + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + io_cqring_ev_posted(ctx); } static void io_free_req(struct io_kiocb *req, struct io_kiocb **nxt) { + if (likely(!(req->flags & REQ_F_LINK))) { + __io_free_req(req); + return; + } + /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & REQ_F_LINK) { - if (req->flags & REQ_F_FAIL_LINK) - io_fail_links(req); - else - io_req_link_next(req, nxt); + if (req->flags & REQ_F_FAIL_LINK) { + io_fail_links(req); + } else if (req->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + /* + * If this is a timeout link, we could be racing with the + * timeout timer. Grab the completion lock for this case to + * protection against that. + */ + spin_lock_irqsave(&ctx->completion_lock, flags); + io_req_link_next(req, nxt); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + io_req_link_next(req, nxt); } __io_free_req(req); @@ -2447,10 +2510,112 @@ static int io_grab_files(struct io_ring_ctx *ctx, struct io_kiocb *req) return ret; } +static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) +{ + struct io_kiocb *req = container_of(timer, struct io_kiocb, + timeout.timer); + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *prev = NULL; + unsigned long flags; + int ret = -ETIME; + + spin_lock_irqsave(&ctx->completion_lock, flags); + + /* + * We don't expect the list to be empty, that will only happen if we + * race with the completion of the linked work. + */ + if (!list_empty(&req->list)) { + prev = list_entry(req->list.prev, struct io_kiocb, link_list); + list_del_init(&req->list); + } + + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + if (prev) { + void *user_data = (void *) (unsigned long) prev->user_data; + ret = io_async_cancel_one(ctx, user_data); + } + + io_cqring_add_event(ctx, req->user_data, ret); + io_put_req(req, NULL); + return HRTIMER_NORESTART; +} + +static int io_queue_linked_timeout(struct io_kiocb *req, struct io_kiocb *nxt) +{ + const struct io_uring_sqe *sqe = nxt->submit.sqe; + enum hrtimer_mode mode; + struct timespec64 ts; + int ret = -EINVAL; + + if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off) + goto err; + if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS) + goto err; + if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) { + ret = -EFAULT; + goto err; + } + + req->flags |= REQ_F_LINK_TIMEOUT; + + if (sqe->timeout_flags & IORING_TIMEOUT_ABS) + mode = HRTIMER_MODE_ABS; + else + mode = HRTIMER_MODE_REL; + hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, mode); + nxt->timeout.timer.function = io_link_timeout_fn; + hrtimer_start(&nxt->timeout.timer, timespec64_to_ktime(ts), mode); + ret = 0; +err: + /* drop submission reference */ + io_put_req(nxt, NULL); + + if (ret) { + struct io_ring_ctx *ctx = req->ctx; + + /* + * Break the link and fail linked timeout, parent will get + * failed by the regular submission path. + */ + list_del(&nxt->list); + io_cqring_fill_event(ctx, nxt->user_data, ret); + trace_io_uring_fail_link(req, nxt); + io_commit_cqring(ctx); + io_put_req(nxt, NULL); + ret = -ECANCELED; + } + + return ret; +} + +static inline struct io_kiocb *io_get_linked_timeout(struct io_kiocb *req) +{ + struct io_kiocb *nxt; + + if (!(req->flags & REQ_F_LINK)) + return NULL; + + nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); + if (nxt && nxt->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) + return nxt; + + return NULL; +} + static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req) { + struct io_kiocb *nxt; int ret; + nxt = io_get_linked_timeout(req); + if (unlikely(nxt)) { + ret = io_queue_linked_timeout(req, nxt); + if (ret) + goto err; + } + ret = __io_submit_sqe(ctx, req, NULL, true); /* @@ -2605,6 +2770,10 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, INIT_LIST_HEAD(&req->link_list); *link = req; + } else if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { + /* Only valid as a linked SQE */ + ret = -EINVAL; + goto err_req; } else { io_queue_sqe(ctx, req); } diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6877cf8894db..f1a118b01d18 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -72,6 +72,7 @@ struct io_uring_sqe { #define IORING_OP_TIMEOUT_REMOVE 12 #define IORING_OP_ACCEPT 13 #define IORING_OP_ASYNC_CANCEL 14 +#define IORING_OP_LINK_TIMEOUT 15 /* * sqe->fsync_flags From 84f97dc2333c626979bb547fce343a1003544dcc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 6 Nov 2019 11:27:53 -0700 Subject: [PATCH 37/64] io_uring: make io_cqring_events() take 'ctx' as argument The rings can be derived from the ctx, and we need the ctx there for a future change. No functional changes in this patch. Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index eadd19ab39a8..d8e15cce936e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -866,8 +866,10 @@ static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr) } } -static unsigned io_cqring_events(struct io_rings *rings) +static unsigned io_cqring_events(struct io_ring_ctx *ctx) { + struct io_rings *rings = ctx->rings; + /* See comment at the top of this file */ smp_rmb(); return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); @@ -1023,7 +1025,7 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (io_cqring_events(ctx->rings)) + if (io_cqring_events(ctx)) break; /* @@ -3076,7 +3078,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ - return io_cqring_events(ctx->rings) >= iowq->to_wait || + return io_cqring_events(ctx) >= iowq->to_wait || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } @@ -3111,7 +3113,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, struct io_rings *rings = ctx->rings; int ret = 0; - if (io_cqring_events(rings) >= min_events) + if (io_cqring_events(ctx) >= min_events) return 0; if (sig) { From 78e19bbef38362cebff38aa1ca12e2c82bb72eb8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 6 Nov 2019 15:21:34 -0700 Subject: [PATCH 38/64] io_uring: pass in io_kiocb to fill/add CQ handlers This is in preparation for handling CQ ring overflow a bit smarter. We should not have any functional changes in this patch. Most of the changes are fairly straight forward, the only ones that stick out a bit are the ones that change __io_free_req() to take the reference count into account. If the request hasn't been submitted yet, we know it's safe to simply ignore references and free it. But let's clean these up too, as later patches will depend on the caller doing the right thing if the completion logging grabs a reference to the request. Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 96 +++++++++++++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 46 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d8e15cce936e..91103fc9771d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -369,10 +369,10 @@ struct io_submit_state { }; static void io_wq_submit_work(struct io_wq_work **workptr); -static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, - long res); +static void io_cqring_fill_event(struct io_kiocb *req, long res); static void __io_free_req(struct io_kiocb *req); static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr); +static void io_double_put_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -535,8 +535,8 @@ static void io_kill_timeout(struct io_kiocb *req) if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); list_del_init(&req->list); - io_cqring_fill_event(req->ctx, req->user_data, 0); - __io_free_req(req); + io_cqring_fill_event(req, 0); + io_put_req(req, NULL); } } @@ -588,12 +588,12 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) return &rings->cqes[tail & ctx->cq_mask]; } -static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, - long res) +static void io_cqring_fill_event(struct io_kiocb *req, long res) { + struct io_ring_ctx *ctx = req->ctx; struct io_uring_cqe *cqe; - trace_io_uring_complete(ctx, ki_user_data, res); + trace_io_uring_complete(ctx, req->user_data, res); /* * If we can't get a cq entry, userspace overflowed the @@ -602,7 +602,7 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, */ cqe = io_get_cqring(ctx); if (cqe) { - WRITE_ONCE(cqe->user_data, ki_user_data); + WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, 0); } else { @@ -621,13 +621,13 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) eventfd_signal(ctx->cq_ev_fd, 1); } -static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, - long res) +static void io_cqring_add_event(struct io_kiocb *req, long res) { + struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - io_cqring_fill_event(ctx, user_data, res); + io_cqring_fill_event(req, res); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -721,10 +721,10 @@ static bool io_link_cancel_timeout(struct io_ring_ctx *ctx, ret = hrtimer_try_to_cancel(&req->timeout.timer); if (ret != -1) { - io_cqring_fill_event(ctx, req->user_data, -ECANCELED); + io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); req->flags &= ~REQ_F_LINK; - __io_free_req(req); + io_put_req(req, NULL); return true; } @@ -795,8 +795,8 @@ static void io_fail_links(struct io_kiocb *req) link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) { io_link_cancel_timeout(ctx, link); } else { - io_cqring_fill_event(ctx, link->user_data, -ECANCELED); - __io_free_req(link); + io_cqring_fill_event(link, -ECANCELED); + io_double_put_req(link); } } @@ -866,6 +866,13 @@ static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr) } } +static void io_double_put_req(struct io_kiocb *req) +{ + /* drop both submit and complete references */ + if (refcount_sub_and_test(2, &req->refs)) + __io_free_req(req); +} + static unsigned io_cqring_events(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; @@ -898,7 +905,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); - io_cqring_fill_event(ctx, req->user_data, req->result); + io_cqring_fill_event(req, req->result); (*nr_events)++; if (refcount_dec_and_test(&req->refs)) { @@ -1094,7 +1101,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req->ctx, req->user_data, res); + io_cqring_add_event(req, res); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) @@ -1595,15 +1602,14 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, /* * IORING_OP_NOP just posts a completion event, nothing else. */ -static int io_nop(struct io_kiocb *req, u64 user_data) +static int io_nop(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - long err = 0; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - io_cqring_add_event(ctx, user_data, err); + io_cqring_add_event(req, 0); io_put_req(req, NULL); return 0; } @@ -1650,7 +1656,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req->ctx, sqe->user_data, ret); + io_cqring_add_event(req, ret); io_put_req(req, nxt); return 0; } @@ -1697,7 +1703,7 @@ static int io_sync_file_range(struct io_kiocb *req, if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req->ctx, sqe->user_data, ret); + io_cqring_add_event(req, ret); io_put_req(req, nxt); return 0; } @@ -1733,7 +1739,7 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, return ret; } - io_cqring_add_event(req->ctx, sqe->user_data, ret); + io_cqring_add_event(req, ret); if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_put_req(req, nxt); @@ -1789,7 +1795,7 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, } if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req->ctx, sqe->user_data, ret); + io_cqring_add_event(req, ret); io_put_req(req, nxt); return 0; #else @@ -1850,7 +1856,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) } spin_unlock_irq(&ctx->completion_lock); - io_cqring_add_event(req->ctx, sqe->user_data, ret); + io_cqring_add_event(req, ret); if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_put_req(req, NULL); @@ -1861,7 +1867,7 @@ static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req, __poll_t mask) { req->poll.done = true; - io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask)); + io_cqring_fill_event(req, mangle_poll(mask)); io_commit_cqring(ctx); } @@ -2055,7 +2061,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) list_del_init(&req->list); } - io_cqring_fill_event(ctx, req->user_data, -ETIME); + io_cqring_fill_event(req, -ETIME); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -2099,7 +2105,7 @@ static int io_timeout_remove(struct io_kiocb *req, /* didn't find timeout */ if (ret) { fill_ev: - io_cqring_fill_event(ctx, req->user_data, ret); + io_cqring_fill_event(req, ret); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -2115,8 +2121,8 @@ static int io_timeout_remove(struct io_kiocb *req, goto fill_ev; } - io_cqring_fill_event(ctx, req->user_data, 0); - io_cqring_fill_event(ctx, treq->user_data, -ECANCELED); + io_cqring_fill_event(req, 0); + io_cqring_fill_event(treq, -ECANCELED); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -2256,7 +2262,7 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req->ctx, sqe->user_data, ret); + io_cqring_add_event(req, ret); io_put_req(req, nxt); return 0; } @@ -2295,12 +2301,10 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, int ret, opcode; struct sqe_submit *s = &req->submit; - req->user_data = READ_ONCE(s->sqe->user_data); - opcode = READ_ONCE(s->sqe->opcode); switch (opcode) { case IORING_OP_NOP: - ret = io_nop(req, req->user_data); + ret = io_nop(req); break; case IORING_OP_READV: if (unlikely(s->sqe->buf_index)) @@ -2409,7 +2413,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) if (ret) { if (req->flags & REQ_F_LINK) req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(ctx, sqe->user_data, ret); + io_cqring_add_event(req, ret); io_put_req(req, NULL); } @@ -2539,7 +2543,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) ret = io_async_cancel_one(ctx, user_data); } - io_cqring_add_event(ctx, req->user_data, ret); + io_cqring_add_event(req, ret); io_put_req(req, NULL); return HRTIMER_NORESTART; } @@ -2582,7 +2586,7 @@ static int io_queue_linked_timeout(struct io_kiocb *req, struct io_kiocb *nxt) * failed by the regular submission path. */ list_del(&nxt->list); - io_cqring_fill_event(ctx, nxt->user_data, ret); + io_cqring_fill_event(nxt, ret); trace_io_uring_fail_link(req, nxt); io_commit_cqring(ctx); io_put_req(nxt, NULL); @@ -2655,7 +2659,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req) /* and drop final reference, if we failed */ if (ret) { - io_cqring_add_event(ctx, req->user_data, ret); + io_cqring_add_event(req, ret); if (req->flags & REQ_F_LINK) req->flags |= REQ_F_FAIL_LINK; io_put_req(req, NULL); @@ -2671,8 +2675,8 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req) ret = io_req_defer(ctx, req); if (ret) { if (ret != -EIOCBQUEUED) { - io_cqring_add_event(ctx, req->submit.sqe->user_data, ret); - io_free_req(req, NULL); + io_cqring_add_event(req, ret); + io_double_put_req(req); } return 0; } @@ -2698,8 +2702,8 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_req_defer(ctx, req); if (ret) { if (ret != -EIOCBQUEUED) { - io_cqring_add_event(ctx, req->submit.sqe->user_data, ret); - io_free_req(req, NULL); + io_cqring_add_event(req, ret); + io_double_put_req(req); __io_free_req(shadow); return 0; } @@ -2732,6 +2736,8 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, struct sqe_submit *s = &req->submit; int ret; + req->user_data = s->sqe->user_data; + /* enforce forwards compatibility on users */ if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; @@ -2741,13 +2747,11 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_req_set_file(ctx, state, req); if (unlikely(ret)) { err_req: - io_cqring_add_event(ctx, s->sqe->user_data, ret); - io_free_req(req, NULL); + io_cqring_add_event(req, ret); + io_double_put_req(req); return; } - req->user_data = s->sqe->user_data; - /* * If we already have a head request, queue this one for async * submittal once the head completes. If we don't have a head but From 1d7bb1d50fb4dc141c7431cc21fdd24ffcc83c76 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 6 Nov 2019 11:31:17 -0700 Subject: [PATCH 39/64] io_uring: add support for backlogged CQ ring Currently we drop completion events, if the CQ ring is full. That's fine for requests with bounded completion times, but it may make it harder or impossible to use io_uring with networked IO where request completion times are generally unbounded. Or with POLL, for example, which is also unbounded. After this patch, we never overflow the ring, we simply store requests in a backlog for later flushing. This flushing is done automatically by the kernel. To prevent the backlog from growing indefinitely, if the backlog is non-empty, we apply back pressure on IO submissions. Any attempt to submit new IO with a non-empty backlog will get an -EBUSY return from the kernel. This is a signal to the application that it has backlogged CQ events, and that it must reap those before being allowed to submit more IO. Note that if we do return -EBUSY, we will have filled whatever backlogged events into the CQ ring first, if there's room. This means the application can safely reap events WITHOUT entering the kernel and waiting for them, they are already available in the CQ ring. Signed-off-by: Jens Axboe --- fs/io_uring.c | 125 +++++++++++++++++++++++++++------- include/uapi/linux/io_uring.h | 1 + 2 files changed, 103 insertions(+), 23 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 91103fc9771d..4d89a2f222bf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -185,6 +185,7 @@ struct io_ring_ctx { unsigned int flags; bool compat; bool account_mem; + bool cq_overflow_flushed; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -207,6 +208,7 @@ struct io_ring_ctx { struct list_head defer_list; struct list_head timeout_list; + struct list_head cq_overflow_list; wait_queue_head_t inflight_wait; } ____cacheline_aligned_in_smp; @@ -414,6 +416,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ctx->flags = p->flags; init_waitqueue_head(&ctx->cq_wait); + INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ctx_done); init_completion(&ctx->sqo_thread_started); mutex_init(&ctx->uring_lock); @@ -588,6 +591,67 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) return &rings->cqes[tail & ctx->cq_mask]; } +static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +{ + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); + if (waitqueue_active(&ctx->sqo_wait)) + wake_up(&ctx->sqo_wait); + if (ctx->cq_ev_fd) + eventfd_signal(ctx->cq_ev_fd, 1); +} + +static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) +{ + struct io_rings *rings = ctx->rings; + struct io_uring_cqe *cqe; + struct io_kiocb *req; + unsigned long flags; + LIST_HEAD(list); + + if (!force) { + if (list_empty_careful(&ctx->cq_overflow_list)) + return; + if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) == + rings->cq_ring_entries)) + return; + } + + spin_lock_irqsave(&ctx->completion_lock, flags); + + /* if force is set, the ring is going away. always drop after that */ + if (force) + ctx->cq_overflow_flushed = true; + + while (!list_empty(&ctx->cq_overflow_list)) { + cqe = io_get_cqring(ctx); + if (!cqe && !force) + break; + + req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, + list); + list_move(&req->list, &list); + if (cqe) { + WRITE_ONCE(cqe->user_data, req->user_data); + WRITE_ONCE(cqe->res, req->result); + WRITE_ONCE(cqe->flags, 0); + } else { + WRITE_ONCE(ctx->rings->cq_overflow, + atomic_inc_return(&ctx->cached_cq_overflow)); + } + } + + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + io_cqring_ev_posted(ctx); + + while (!list_empty(&list)) { + req = list_first_entry(&list, struct io_kiocb, list); + list_del(&req->list); + io_put_req(req, NULL); + } +} + static void io_cqring_fill_event(struct io_kiocb *req, long res) { struct io_ring_ctx *ctx = req->ctx; @@ -601,26 +665,20 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) * the ring. */ cqe = io_get_cqring(ctx); - if (cqe) { + if (likely(cqe)) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, 0); - } else { + } else if (ctx->cq_overflow_flushed) { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); + } else { + refcount_inc(&req->refs); + req->result = res; + list_add_tail(&req->list, &ctx->cq_overflow_list); } } -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) -{ - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); - if (waitqueue_active(&ctx->sqo_wait)) - wake_up(&ctx->sqo_wait); - if (ctx->cq_ev_fd) - eventfd_signal(ctx->cq_ev_fd, 1); -} - static void io_cqring_add_event(struct io_kiocb *req, long res) { struct io_ring_ctx *ctx = req->ctx; @@ -873,10 +931,20 @@ static void io_double_put_req(struct io_kiocb *req) __io_free_req(req); } -static unsigned io_cqring_events(struct io_ring_ctx *ctx) +static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) { struct io_rings *rings = ctx->rings; + /* + * noflush == true is from the waitqueue handler, just ensure we wake + * up the task, and the next invocation will flush the entries. We + * cannot safely to it from here. + */ + if (noflush && !list_empty(&ctx->cq_overflow_list)) + return -1U; + + io_cqring_overflow_flush(ctx, false); + /* See comment at the top of this file */ smp_rmb(); return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); @@ -1032,7 +1100,7 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (io_cqring_events(ctx)) + if (io_cqring_events(ctx, false)) break; /* @@ -2876,6 +2944,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, int i, submitted = 0; bool mm_fault = false; + if (!list_empty(&ctx->cq_overflow_list)) { + io_cqring_overflow_flush(ctx, false); + return -EBUSY; + } + if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, ctx, nr); statep = &state; @@ -2967,6 +3040,7 @@ static int io_sq_thread(void *data) timeout = inflight = 0; while (!kthread_should_park()) { unsigned int to_submit; + int ret; if (inflight) { unsigned nr_events = 0; @@ -3051,8 +3125,9 @@ static int io_sq_thread(void *data) } to_submit = min(to_submit, ctx->sq_entries); - inflight += io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, - true); + ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); + if (ret > 0) + inflight += ret; } set_fs(old_fs); @@ -3073,7 +3148,7 @@ struct io_wait_queue { unsigned nr_timeouts; }; -static inline bool io_should_wake(struct io_wait_queue *iowq) +static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush) { struct io_ring_ctx *ctx = iowq->ctx; @@ -3082,7 +3157,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ - return io_cqring_events(ctx) >= iowq->to_wait || + return io_cqring_events(ctx, noflush) >= iowq->to_wait || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } @@ -3092,7 +3167,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); - if (!io_should_wake(iowq)) + /* use noflush == true, as we can't safely rely on locking context */ + if (!io_should_wake(iowq, true)) return -1; return autoremove_wake_function(curr, mode, wake_flags, key); @@ -3117,7 +3193,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, struct io_rings *rings = ctx->rings; int ret = 0; - if (io_cqring_events(ctx) >= min_events) + if (io_cqring_events(ctx, false) >= min_events) return 0; if (sig) { @@ -3138,7 +3214,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); - if (io_should_wake(&iowq)) + if (io_should_wake(&iowq, false)) break; schedule(); if (signal_pending(current)) { @@ -4061,6 +4137,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_wq_cancel_all(ctx->io_wq); io_iopoll_reap_events(ctx); + io_cqring_overflow_flush(ctx, true); wait_for_completion(&ctx->ctx_done); io_ring_ctx_free(ctx); } @@ -4116,8 +4193,10 @@ static int io_uring_flush(struct file *file, void *data) struct io_ring_ctx *ctx = file->private_data; io_uring_cancel_files(ctx, data); - if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { + io_cqring_overflow_flush(ctx, true); io_wq_cancel_all(ctx->io_wq); + } return 0; } @@ -4391,7 +4470,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret < 0) goto err; - p->features = IORING_FEAT_SINGLE_MMAP; + p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f1a118b01d18..2a1569211d87 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -155,6 +155,7 @@ struct io_uring_params { * io_uring_params->features flags */ #define IORING_FEAT_SINGLE_MMAP (1U << 0) +#define IORING_FEAT_NODROP (1U << 1) /* * io_uring_register(2) opcodes and arguments From 91d666ea43adef57a6cd50c81b9603c545654981 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Nov 2019 09:17:36 -0700 Subject: [PATCH 40/64] io-wq: io_wqe_run_queue() doesn't need to use list_empty_careful() We hold the wqe lock at this point (which is also annotated), so there's no need to use the careful variant of list_empty(). Signed-off-by: Jens Axboe --- fs/io-wq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index ba40a7ee31c3..9b375009a553 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -338,8 +338,7 @@ static void io_worker_handle_work(struct io_worker *worker) static inline bool io_wqe_run_queue(struct io_wqe *wqe) __must_hold(wqe->lock) { - if (!list_empty_careful(&wqe->work_list) && - !(wqe->flags & IO_WQE_FLAG_STALLED)) + if (!list_empty(&wqe->work_list) && !(wqe->flags & IO_WQE_FLAG_STALLED)) return true; return false; } From c5def4ab849494d3c97f6c9fc84b2ddb868fe78c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Nov 2019 11:41:16 -0700 Subject: [PATCH 41/64] io-wq: add support for bounded vs unbunded work io_uring supports request types that basically have two different lifetimes: 1) Bounded completion time. These are requests like disk reads or writes, which we know will finish in a finite amount of time. 2) Unbounded completion time. These are generally networked IO, where we have no idea how long they will take to complete. Another example is POLL commands. This patch provides support for io-wq to handle these differently, so we don't starve bounded requests by tying up workers for too long. By default all work is bounded, unless otherwise specified in the work item. Signed-off-by: Jens Axboe --- fs/io-wq.c | 296 ++++++++++++++++++++++++++++++++++++-------------- fs/io-wq.h | 4 +- fs/io_uring.c | 2 +- 3 files changed, 220 insertions(+), 82 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 9b375009a553..33b14b85752b 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -27,6 +27,7 @@ enum { IO_WORKER_F_FREE = 4, /* worker on free list */ IO_WORKER_F_EXITING = 8, /* worker exiting */ IO_WORKER_F_FIXED = 16, /* static idle worker */ + IO_WORKER_F_BOUND = 32, /* is doing bounded work */ }; enum { @@ -66,6 +67,17 @@ struct io_wq_nulls_list { #define IO_WQ_HASH_ORDER 5 #endif +struct io_wqe_acct { + unsigned nr_workers; + unsigned max_workers; + atomic_t nr_running; +}; + +enum { + IO_WQ_ACCT_BOUND, + IO_WQ_ACCT_UNBOUND, +}; + /* * Per-node worker thread pool */ @@ -78,9 +90,7 @@ struct io_wqe { } ____cacheline_aligned_in_smp; int node; - unsigned nr_workers; - unsigned max_workers; - atomic_t nr_running; + struct io_wqe_acct acct[2]; struct io_wq_nulls_list free_list; struct io_wq_nulls_list busy_list; @@ -97,6 +107,7 @@ struct io_wq { unsigned nr_wqes; struct task_struct *manager; + struct user_struct *user; struct mm_struct *mm; refcount_t refs; struct completion done; @@ -152,10 +163,29 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) return dropped_lock; } +static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, + struct io_wq_work *work) +{ + if (work->flags & IO_WQ_WORK_UNBOUND) + return &wqe->acct[IO_WQ_ACCT_UNBOUND]; + + return &wqe->acct[IO_WQ_ACCT_BOUND]; +} + +static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe, + struct io_worker *worker) +{ + if (worker->flags & IO_WORKER_F_BOUND) + return &wqe->acct[IO_WQ_ACCT_BOUND]; + + return &wqe->acct[IO_WQ_ACCT_UNBOUND]; +} + static void io_worker_exit(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; - bool all_done = false; + struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + unsigned nr_workers; /* * If we're not at zero, someone else is holding a brief reference @@ -169,7 +199,9 @@ static void io_worker_exit(struct io_worker *worker) preempt_disable(); current->flags &= ~PF_IO_WORKER; if (worker->flags & IO_WORKER_F_RUNNING) - atomic_dec(&wqe->nr_running); + atomic_dec(&acct->nr_running); + if (!(worker->flags & IO_WORKER_F_BOUND)) + atomic_dec(&wqe->wq->user->processes); worker->flags = 0; preempt_enable(); @@ -179,17 +211,88 @@ static void io_worker_exit(struct io_worker *worker) __release(&wqe->lock); spin_lock_irq(&wqe->lock); } - wqe->nr_workers--; - all_done = !wqe->nr_workers; + acct->nr_workers--; + nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers + + wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers; spin_unlock_irq(&wqe->lock); /* all workers gone, wq exit can proceed */ - if (all_done && refcount_dec_and_test(&wqe->wq->refs)) + if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs)) complete(&wqe->wq->done); kfree_rcu(worker, rcu); } +static inline bool io_wqe_run_queue(struct io_wqe *wqe) + __must_hold(wqe->lock) +{ + if (!list_empty(&wqe->work_list) && !(wqe->flags & IO_WQE_FLAG_STALLED)) + return true; + return false; +} + +/* + * Check head of free list for an available worker. If one isn't available, + * caller must wake up the wq manager to create one. + */ +static bool io_wqe_activate_free_worker(struct io_wqe *wqe) + __must_hold(RCU) +{ + struct hlist_nulls_node *n; + struct io_worker *worker; + + n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list.head)); + if (is_a_nulls(n)) + return false; + + worker = hlist_nulls_entry(n, struct io_worker, nulls_node); + if (io_worker_get(worker)) { + wake_up(&worker->wait); + io_worker_release(worker); + return true; + } + + return false; +} + +/* + * We need a worker. If we find a free one, we're good. If not, and we're + * below the max number of workers, wake up the manager to create one. + */ +static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) +{ + bool ret; + + /* + * Most likely an attempt to queue unbounded work on an io_wq that + * wasn't setup with any unbounded workers. + */ + WARN_ON_ONCE(!acct->max_workers); + + rcu_read_lock(); + ret = io_wqe_activate_free_worker(wqe); + rcu_read_unlock(); + + if (!ret && acct->nr_workers < acct->max_workers) + wake_up_process(wqe->wq->manager); +} + +static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker) +{ + struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + + atomic_inc(&acct->nr_running); +} + +static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker) + __must_hold(wqe->lock) +{ + struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + + if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) + io_wqe_wake_worker(wqe, acct); +} + static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) { allow_kernel_signal(SIGINT); @@ -198,7 +301,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); worker->restore_files = current->files; - atomic_inc(&wqe->nr_running); + io_wqe_inc_running(wqe, worker); } /* @@ -209,6 +312,8 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, struct io_wq_work *work) __must_hold(wqe->lock) { + bool worker_bound, work_bound; + if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; hlist_nulls_del_init_rcu(&worker->nulls_node); @@ -216,6 +321,28 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, &wqe->busy_list.head); } worker->cur_work = work; + + /* + * If worker is moving from bound to unbound (or vice versa), then + * ensure we update the running accounting. + */ + worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0; + work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0; + if (worker_bound != work_bound) { + io_wqe_dec_running(wqe, worker); + if (work_bound) { + worker->flags |= IO_WORKER_F_BOUND; + wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--; + wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++; + atomic_dec(&wqe->wq->user->processes); + } else { + worker->flags &= ~IO_WORKER_F_BOUND; + wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++; + wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--; + atomic_inc(&wqe->wq->user->processes); + } + io_wqe_inc_running(wqe, worker); + } } /* @@ -335,14 +462,6 @@ static void io_worker_handle_work(struct io_worker *worker) } while (1); } -static inline bool io_wqe_run_queue(struct io_wqe *wqe) - __must_hold(wqe->lock) -{ - if (!list_empty(&wqe->work_list) && !(wqe->flags & IO_WQE_FLAG_STALLED)) - return true; - return false; -} - static int io_wqe_worker(void *data) { struct io_worker *worker = data; @@ -391,46 +510,6 @@ static int io_wqe_worker(void *data) return 0; } -/* - * Check head of free list for an available worker. If one isn't available, - * caller must wake up the wq manager to create one. - */ -static bool io_wqe_activate_free_worker(struct io_wqe *wqe) - __must_hold(RCU) -{ - struct hlist_nulls_node *n; - struct io_worker *worker; - - n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list.head)); - if (is_a_nulls(n)) - return false; - - worker = hlist_nulls_entry(n, struct io_worker, nulls_node); - if (io_worker_get(worker)) { - wake_up(&worker->wait); - io_worker_release(worker); - return true; - } - - return false; -} - -/* - * We need a worker. If we find a free one, we're good. If not, and we're - * below the max number of workers, wake up the manager to create one. - */ -static void io_wqe_wake_worker(struct io_wqe *wqe) -{ - bool ret; - - rcu_read_lock(); - ret = io_wqe_activate_free_worker(wqe); - rcu_read_unlock(); - - if (!ret && wqe->nr_workers < wqe->max_workers) - wake_up_process(wqe->wq->manager); -} - /* * Called when a worker is scheduled in. Mark us as currently running. */ @@ -444,7 +523,7 @@ void io_wq_worker_running(struct task_struct *tsk) if (worker->flags & IO_WORKER_F_RUNNING) return; worker->flags |= IO_WORKER_F_RUNNING; - atomic_inc(&wqe->nr_running); + io_wqe_inc_running(wqe, worker); } /* @@ -465,13 +544,13 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; spin_lock_irq(&wqe->lock); - if (atomic_dec_and_test(&wqe->nr_running) && io_wqe_run_queue(wqe)) - io_wqe_wake_worker(wqe); + io_wqe_dec_running(wqe, worker); spin_unlock_irq(&wqe->lock); } -static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe) +static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) { + struct io_wqe_acct *acct =&wqe->acct[index]; struct io_worker *worker; worker = kcalloc_node(1, sizeof(*worker), GFP_KERNEL, wqe->node); @@ -484,7 +563,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe) worker->wqe = wqe; worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node, - "io_wqe_worker-%d", wqe->node); + "io_wqe_worker-%d/%d", index, wqe->node); if (IS_ERR(worker->task)) { kfree(worker); return; @@ -493,24 +572,31 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe) spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list.head); worker->flags |= IO_WORKER_F_FREE; - if (!wqe->nr_workers) + if (index == IO_WQ_ACCT_BOUND) + worker->flags |= IO_WORKER_F_BOUND; + if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND)) worker->flags |= IO_WORKER_F_FIXED; - wqe->nr_workers++; + acct->nr_workers++; spin_unlock_irq(&wqe->lock); + if (index == IO_WQ_ACCT_UNBOUND) + atomic_inc(&wq->user->processes); + wake_up_process(worker->task); } -static inline bool io_wqe_need_new_worker(struct io_wqe *wqe) +static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) __must_hold(wqe->lock) { - if (!wqe->nr_workers) - return true; - if (hlist_nulls_empty(&wqe->free_list.head) && - wqe->nr_workers < wqe->max_workers && io_wqe_run_queue(wqe)) - return true; + struct io_wqe_acct *acct = &wqe->acct[index]; - return false; + /* always ensure we have one bounded worker */ + if (index == IO_WQ_ACCT_BOUND && !acct->nr_workers) + return true; + /* if we have available workers or no work, no need */ + if (!hlist_nulls_empty(&wqe->free_list.head) || !io_wqe_run_queue(wqe)) + return false; + return acct->nr_workers < acct->max_workers; } /* @@ -525,13 +611,18 @@ static int io_wq_manager(void *data) for (i = 0; i < wq->nr_wqes; i++) { struct io_wqe *wqe = wq->wqes[i]; - bool fork_worker = false; + bool fork_worker[2] = { false, false }; spin_lock_irq(&wqe->lock); - fork_worker = io_wqe_need_new_worker(wqe); + if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) + fork_worker[IO_WQ_ACCT_BOUND] = true; + if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) + fork_worker[IO_WQ_ACCT_UNBOUND] = true; spin_unlock_irq(&wqe->lock); - if (fork_worker) - create_io_worker(wq, wqe); + if (fork_worker[IO_WQ_ACCT_BOUND]) + create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); + if (fork_worker[IO_WQ_ACCT_UNBOUND]) + create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND); } set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ); @@ -540,17 +631,53 @@ static int io_wq_manager(void *data) return 0; } +static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, + struct io_wq_work *work) +{ + bool free_worker; + + if (!(work->flags & IO_WQ_WORK_UNBOUND)) + return true; + if (atomic_read(&acct->nr_running)) + return true; + + rcu_read_lock(); + free_worker = !hlist_nulls_empty(&wqe->free_list.head); + rcu_read_unlock(); + if (free_worker) + return true; + + if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers && + !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN))) + return false; + + return true; +} + static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { + struct io_wqe_acct *acct = io_work_get_acct(wqe, work); unsigned long flags; + /* + * Do early check to see if we need a new unbound worker, and if we do, + * if we're allowed to do so. This isn't 100% accurate as there's a + * gap between this check and incrementing the value, but that's OK. + * It's close enough to not be an issue, fork() has the same delay. + */ + if (unlikely(!io_wq_can_queue(wqe, acct, work))) { + work->flags |= IO_WQ_WORK_CANCEL; + work->func(&work); + return; + } + spin_lock_irqsave(&wqe->lock, flags); list_add_tail(&work->list, &wqe->work_list); wqe->flags &= ~IO_WQE_FLAG_STALLED; spin_unlock_irqrestore(&wqe->lock, flags); - if (!atomic_read(&wqe->nr_running)) - io_wqe_wake_worker(wqe); + if (!atomic_read(&acct->nr_running)) + io_wqe_wake_worker(wqe, acct); } void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) @@ -828,7 +955,8 @@ void io_wq_flush(struct io_wq *wq) } } -struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm) +struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, + struct user_struct *user) { int ret = -ENOMEM, i, node; struct io_wq *wq; @@ -844,6 +972,9 @@ struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm) return ERR_PTR(-ENOMEM); } + /* caller must already hold a reference to this */ + wq->user = user; + i = 0; refcount_set(&wq->refs, wq->nr_wqes); for_each_online_node(node) { @@ -854,7 +985,13 @@ struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm) break; wq->wqes[i] = wqe; wqe->node = node; - wqe->max_workers = concurrency; + wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; + atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0); + if (user) { + wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = + task_rlimit(current, RLIMIT_NPROC); + } + atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); wqe->node = node; wqe->wq = wq; spin_lock_init(&wqe->lock); @@ -863,7 +1000,6 @@ struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm) wqe->free_list.nulls = 0; INIT_HLIST_NULLS_HEAD(&wqe->busy_list.head, 1); wqe->busy_list.nulls = 1; - atomic_set(&wqe->nr_running, 0); i++; } diff --git a/fs/io-wq.h b/fs/io-wq.h index 3de192dc73fc..8cb345256f35 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -9,6 +9,7 @@ enum { IO_WQ_WORK_HASHED = 4, IO_WQ_WORK_NEEDS_USER = 8, IO_WQ_WORK_NEEDS_FILES = 16, + IO_WQ_WORK_UNBOUND = 32, IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -33,7 +34,8 @@ struct io_wq_work { (work)->files = NULL; \ } while (0) \ -struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm); +struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, + struct user_struct *user); void io_wq_destroy(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); diff --git a/fs/io_uring.c b/fs/io_uring.c index 4d89a2f222bf..831bea0fbc75 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3745,7 +3745,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm); + ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, NULL); if (IS_ERR(ctx->io_wq)) { ret = PTR_ERR(ctx->io_wq); ctx->io_wq = NULL; From 5f8fd2d3e0a7aa7fc9d97226be24286edd289835 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Nov 2019 10:57:36 -0700 Subject: [PATCH 42/64] io_uring: properly mark async work as bounded vs unbounded Now that io-wq supports separating the two request lifetime types, mark the following IO as having unbounded runtimes: - Any read/write to a non-regular file - Any specific networked IO - Any poll command Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 831bea0fbc75..02a4f5e8a6e4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -506,6 +506,20 @@ static inline bool io_prep_async_work(struct io_kiocb *req) case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: do_hashed = true; + /* fall-through */ + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_SENDMSG: + case IORING_OP_RECVMSG: + case IORING_OP_ACCEPT: + case IORING_OP_POLL_ADD: + /* + * We know REQ_F_ISREG is not set on some of these + * opcodes, but this enables us to keep the check in + * just one place. + */ + if (!(req->flags & REQ_F_ISREG)) + req->work.flags |= IO_WQ_WORK_UNBOUND; break; } if (io_sqe_needs_user(req->submit.sqe)) @@ -3745,7 +3759,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, NULL); + ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, ctx->user); if (IS_ERR(ctx->io_wq)) { ret = PTR_ERR(ctx->io_wq); ctx->io_wq = NULL; From 206aefde4f886fdeb3b6339aacab3a85fb74cb7e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Nov 2019 18:27:42 -0700 Subject: [PATCH 43/64] io_uring: reduce/pack size of io_ring_ctx With the recent flurry of additions and changes to io_uring, the layout of io_ring_ctx has become a bit stale. We're right now at 704 bytes in size on my x86-64 build, or 11 cachelines. This patch does two things: - We have to completion structs embedded, that we only use for quiesce of the ctx (or shutdown) and for sqthread init cases. That 2x32 bytes right there, let's dynamically allocate them. - Reorder the struct a bit with an eye on cachelines, use cases, and holes. With this patch, we're down to 512 bytes, or 8 cachelines. Reviewed-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 69 ++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 02a4f5e8a6e4..710eb27bf379 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -204,6 +204,7 @@ struct io_ring_ctx { unsigned sq_mask; unsigned sq_thread_idle; unsigned cached_sq_dropped; + atomic_t cached_cq_overflow; struct io_uring_sqe *sq_sqes; struct list_head defer_list; @@ -213,25 +214,13 @@ struct io_ring_ctx { wait_queue_head_t inflight_wait; } ____cacheline_aligned_in_smp; + struct io_rings *rings; + /* IO offload */ struct io_wq *io_wq; struct task_struct *sqo_thread; /* if using sq thread polling */ struct mm_struct *sqo_mm; wait_queue_head_t sqo_wait; - struct completion sqo_thread_started; - - struct { - unsigned cached_cq_tail; - atomic_t cached_cq_overflow; - unsigned cq_entries; - unsigned cq_mask; - struct wait_queue_head cq_wait; - struct fasync_struct *cq_fasync; - struct eventfd_ctx *cq_ev_fd; - atomic_t cq_timeouts; - } ____cacheline_aligned_in_smp; - - struct io_rings *rings; /* * If used, fixed file set. Writers must ensure that ->refs is dead, @@ -247,7 +236,22 @@ struct io_ring_ctx { struct user_struct *user; - struct completion ctx_done; + /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ + struct completion *completions; + +#if defined(CONFIG_UNIX) + struct socket *ring_sock; +#endif + + struct { + unsigned cached_cq_tail; + unsigned cq_entries; + unsigned cq_mask; + atomic_t cq_timeouts; + struct wait_queue_head cq_wait; + struct fasync_struct *cq_fasync; + struct eventfd_ctx *cq_ev_fd; + } ____cacheline_aligned_in_smp; struct { struct mutex uring_lock; @@ -269,10 +273,6 @@ struct io_ring_ctx { spinlock_t inflight_lock; struct list_head inflight_list; } ____cacheline_aligned_in_smp; - -#if defined(CONFIG_UNIX) - struct socket *ring_sock; -#endif }; struct sqe_submit { @@ -397,7 +397,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); - complete(&ctx->ctx_done); + complete(&ctx->completions[0]); } static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) @@ -408,17 +408,19 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (!ctx) return NULL; + ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL); + if (!ctx->completions) + goto err; + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { - kfree(ctx); - return NULL; - } + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) + goto err; ctx->flags = p->flags; init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); - init_completion(&ctx->ctx_done); - init_completion(&ctx->sqo_thread_started); + init_completion(&ctx->completions[0]); + init_completion(&ctx->completions[1]); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); @@ -430,6 +432,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) spin_lock_init(&ctx->inflight_lock); INIT_LIST_HEAD(&ctx->inflight_list); return ctx; +err: + kfree(ctx->completions); + kfree(ctx); + return NULL; } static inline bool __io_sequence_defer(struct io_ring_ctx *ctx, @@ -3046,7 +3052,7 @@ static int io_sq_thread(void *data) unsigned inflight; unsigned long timeout; - complete(&ctx->sqo_thread_started); + complete(&ctx->completions[1]); old_fs = get_fs(); set_fs(USER_DS); @@ -3286,7 +3292,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) static void io_sq_thread_stop(struct io_ring_ctx *ctx) { if (ctx->sqo_thread) { - wait_for_completion(&ctx->sqo_thread_started); + wait_for_completion(&ctx->completions[1]); /* * The park is a bit of a work-around, without it we get * warning spews on shutdown with SQPOLL set and affinity @@ -4108,6 +4114,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_unaccount_mem(ctx->user, ring_pages(ctx->sq_entries, ctx->cq_entries)); free_uid(ctx->user); + kfree(ctx->completions); kfree(ctx); } @@ -4152,7 +4159,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_iopoll_reap_events(ctx); io_cqring_overflow_flush(ctx, true); - wait_for_completion(&ctx->ctx_done); + wait_for_completion(&ctx->completions[0]); io_ring_ctx_free(ctx); } @@ -4555,7 +4562,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, * no new references will come in after we've killed the percpu ref. */ mutex_unlock(&ctx->uring_lock); - wait_for_completion(&ctx->ctx_done); + wait_for_completion(&ctx->completions[0]); mutex_lock(&ctx->uring_lock); switch (opcode) { @@ -4598,7 +4605,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, } /* bring the ctx back to life */ - reinit_completion(&ctx->ctx_done); + reinit_completion(&ctx->completions[0]); percpu_ref_reinit(&ctx->refs); return ret; } From a197f664a0db8a6219d9ce949f5f29b89f60fb2b Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Fri, 8 Nov 2019 08:09:12 -0700 Subject: [PATCH 44/64] io_uring: remove passed in 'ctx' function parameter ctx if possible Many times, the core of the function is req, and req has already set req->ctx at initialization time, so there is no need to pass in the ctx from the caller. Cleanup, no functional change. Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 108 ++++++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 710eb27bf379..147d1f0e13cc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -438,20 +438,20 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return NULL; } -static inline bool __io_sequence_defer(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static inline bool __io_sequence_defer(struct io_kiocb *req) { + struct io_ring_ctx *ctx = req->ctx; + return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped + atomic_read(&ctx->cached_cq_overflow); } -static inline bool io_sequence_defer(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static inline bool io_sequence_defer(struct io_kiocb *req) { if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) return false; - return __io_sequence_defer(ctx, req); + return __io_sequence_defer(req); } static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) @@ -459,7 +459,7 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) struct io_kiocb *req; req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list); - if (req && !io_sequence_defer(ctx, req)) { + if (req && !io_sequence_defer(req)) { list_del_init(&req->list); return req; } @@ -472,7 +472,7 @@ static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) struct io_kiocb *req; req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list); - if (req && !__io_sequence_defer(ctx, req)) { + if (req && !__io_sequence_defer(req)) { list_del_init(&req->list); return req; } @@ -535,10 +535,10 @@ static inline bool io_prep_async_work(struct io_kiocb *req) return do_hashed; } -static inline void io_queue_async_work(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static inline void io_queue_async_work(struct io_kiocb *req) { bool do_hashed = io_prep_async_work(req); + struct io_ring_ctx *ctx = req->ctx; trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work, req->flags); @@ -589,7 +589,7 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) continue; } req->flags |= REQ_F_IO_DRAINED; - io_queue_async_work(ctx, req); + io_queue_async_work(req); } } @@ -792,9 +792,9 @@ static void __io_free_req(struct io_kiocb *req) kmem_cache_free(req_cachep, req); } -static bool io_link_cancel_timeout(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static bool io_link_cancel_timeout(struct io_kiocb *req) { + struct io_ring_ctx *ctx = req->ctx; int ret; ret = hrtimer_try_to_cancel(&req->timeout.timer); @@ -834,7 +834,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * in this context instead of having to queue up new async work. */ if (req->flags & REQ_F_LINK_TIMEOUT) { - wake_ev = io_link_cancel_timeout(ctx, nxt); + wake_ev = io_link_cancel_timeout(nxt); /* we dropped this link, get next */ nxt = list_first_entry_or_null(&req->link_list, @@ -843,7 +843,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) *nxtptr = nxt; break; } else { - io_queue_async_work(req->ctx, nxt); + io_queue_async_work(nxt); break; } } @@ -871,7 +871,7 @@ static void io_fail_links(struct io_kiocb *req) if ((req->flags & REQ_F_LINK_TIMEOUT) && link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) { - io_link_cancel_timeout(ctx, link); + io_link_cancel_timeout(link); } else { io_cqring_fill_event(link, -ECANCELED); io_double_put_req(link); @@ -940,7 +940,7 @@ static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr) if (nxtptr) *nxtptr = nxt; else - io_queue_async_work(nxt->ctx, nxt); + io_queue_async_work(nxt); } } @@ -1899,7 +1899,7 @@ static void io_poll_remove_one(struct io_kiocb *req) WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); - io_queue_async_work(req->ctx, req); + io_queue_async_work(req); } spin_unlock(&poll->head->lock); @@ -1951,9 +1951,10 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req, - __poll_t mask) +static void io_poll_complete(struct io_kiocb *req, __poll_t mask) { + struct io_ring_ctx *ctx = req->ctx; + req->poll.done = true; io_cqring_fill_event(req, mangle_poll(mask)); io_commit_cqring(ctx); @@ -1989,7 +1990,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) return; } list_del_init(&req->list); - io_poll_complete(ctx, req, mask); + io_poll_complete(req, mask); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -2017,13 +2018,13 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { list_del(&req->list); - io_poll_complete(ctx, req, mask); + io_poll_complete(req, mask); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); io_put_req(req, NULL); } else { - io_queue_async_work(ctx, req); + io_queue_async_work(req); } return 1; @@ -2108,7 +2109,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, } if (mask) { /* no async, we'd stolen it */ ipt.error = 0; - io_poll_complete(ctx, req, mask); + io_poll_complete(req, mask); } spin_unlock_irq(&ctx->completion_lock); @@ -2355,12 +2356,13 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req) +static int io_req_defer(struct io_kiocb *req) { const struct io_uring_sqe *sqe = req->submit.sqe; struct io_uring_sqe *sqe_copy; + struct io_ring_ctx *ctx = req->ctx; - if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) + if (!io_sequence_defer(req) && list_empty(&ctx->defer_list)) return 0; sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); @@ -2368,7 +2370,7 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req) return -EAGAIN; spin_lock_irq(&ctx->completion_lock); - if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) { + if (!io_sequence_defer(req) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); kfree(sqe_copy); return 0; @@ -2383,11 +2385,12 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req) return -EIOCBQUEUED; } -static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct io_kiocb **nxt, bool force_nonblock) +static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { int ret, opcode; struct sqe_submit *s = &req->submit; + struct io_ring_ctx *ctx = req->ctx; opcode = READ_ONCE(s->sqe->opcode); switch (opcode) { @@ -2467,7 +2470,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) { struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_ring_ctx *ctx = req->ctx; struct sqe_submit *s = &req->submit; const struct io_uring_sqe *sqe = s->sqe; struct io_kiocb *nxt = NULL; @@ -2483,7 +2485,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; s->in_async = true; do { - ret = __io_submit_sqe(ctx, req, &nxt, false); + ret = __io_submit_sqe(req, &nxt, false); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -2537,10 +2539,10 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, return table->files[index & IORING_FILE_TABLE_MASK]; } -static int io_req_set_file(struct io_ring_ctx *ctx, - struct io_submit_state *state, struct io_kiocb *req) +static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) { struct sqe_submit *s = &req->submit; + struct io_ring_ctx *ctx = req->ctx; unsigned flags; int fd; @@ -2580,9 +2582,10 @@ static int io_req_set_file(struct io_ring_ctx *ctx, return 0; } -static int io_grab_files(struct io_ring_ctx *ctx, struct io_kiocb *req) +static int io_grab_files(struct io_kiocb *req) { int ret = -EBADF; + struct io_ring_ctx *ctx = req->ctx; rcu_read_lock(); spin_lock_irq(&ctx->inflight_lock); @@ -2698,7 +2701,7 @@ static inline struct io_kiocb *io_get_linked_timeout(struct io_kiocb *req) return NULL; } -static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req) +static int __io_queue_sqe(struct io_kiocb *req) { struct io_kiocb *nxt; int ret; @@ -2710,7 +2713,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req) goto err; } - ret = __io_submit_sqe(ctx, req, NULL, true); + ret = __io_submit_sqe(req, NULL, true); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -2725,7 +2728,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req) if (sqe_copy) { s->sqe = sqe_copy; if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { - ret = io_grab_files(ctx, req); + ret = io_grab_files(req); if (ret) { kfree(sqe_copy); goto err; @@ -2736,7 +2739,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req) * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. */ - io_queue_async_work(ctx, req); + io_queue_async_work(req); return 0; } } @@ -2756,11 +2759,11 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req) return ret; } -static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req) +static int io_queue_sqe(struct io_kiocb *req) { int ret; - ret = io_req_defer(ctx, req); + ret = io_req_defer(req); if (ret) { if (ret != -EIOCBQUEUED) { io_cqring_add_event(req, ret); @@ -2769,17 +2772,17 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req) return 0; } - return __io_queue_sqe(ctx, req); + return __io_queue_sqe(req); } -static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct io_kiocb *shadow) +static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow) { int ret; int need_submit = false; + struct io_ring_ctx *ctx = req->ctx; if (!shadow) - return io_queue_sqe(ctx, req); + return io_queue_sqe(req); /* * Mark the first IO in link list as DRAIN, let all the following @@ -2787,7 +2790,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, * list. */ req->flags |= REQ_F_IO_DRAIN; - ret = io_req_defer(ctx, req); + ret = io_req_defer(req); if (ret) { if (ret != -EIOCBQUEUED) { io_cqring_add_event(req, ret); @@ -2810,18 +2813,19 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, spin_unlock_irq(&ctx->completion_lock); if (need_submit) - return __io_queue_sqe(ctx, req); + return __io_queue_sqe(req); return 0; } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) -static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct io_submit_state *state, struct io_kiocb **link) +static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, + struct io_kiocb **link) { struct io_uring_sqe *sqe_copy; struct sqe_submit *s = &req->submit; + struct io_ring_ctx *ctx = req->ctx; int ret; req->user_data = s->sqe->user_data; @@ -2832,7 +2836,7 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, goto err_req; } - ret = io_req_set_file(ctx, state, req); + ret = io_req_set_file(state, req); if (unlikely(ret)) { err_req: io_cqring_add_event(req, ret); @@ -2869,7 +2873,7 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = -EINVAL; goto err_req; } else { - io_queue_sqe(ctx, req); + io_queue_sqe(req); } } @@ -3018,7 +3022,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, req->submit.needs_fixed_file = async; trace_io_uring_submit_sqe(ctx, req->submit.sqe->user_data, true, async); - io_submit_sqe(ctx, req, statep, &link); + io_submit_sqe(req, statep, &link); submitted++; /* @@ -3026,14 +3030,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, * that's the end of the chain. Submit the previous link. */ if (!(sqe_flags & IOSQE_IO_LINK) && link) { - io_queue_link_head(ctx, link, shadow_req); + io_queue_link_head(link, shadow_req); link = NULL; shadow_req = NULL; } } if (link) - io_queue_link_head(ctx, link, shadow_req); + io_queue_link_head(link, shadow_req); if (statep) io_submit_state_end(&state); From ec9c02ad4c3808d6d9ed28ad1d0485d6e2a33ac5 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Fri, 8 Nov 2019 23:50:36 +0800 Subject: [PATCH 45/64] io_uring: keep io_put_req only responsible for release and put req We already have io_put_req_find_next to find the next req of the link. we should not use the io_put_req function to find them. They should be functions of the same level. Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 73 ++++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 147d1f0e13cc..1597838d5073 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -373,7 +373,7 @@ struct io_submit_state { static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void __io_free_req(struct io_kiocb *req); -static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr); +static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -559,7 +559,7 @@ static void io_kill_timeout(struct io_kiocb *req) atomic_inc(&req->ctx->cq_timeouts); list_del_init(&req->list); io_cqring_fill_event(req, 0); - io_put_req(req, NULL); + io_put_req(req); } } @@ -668,7 +668,7 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) while (!list_empty(&list)) { req = list_first_entry(&list, struct io_kiocb, list); list_del(&req->list); - io_put_req(req, NULL); + io_put_req(req); } } @@ -802,7 +802,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); req->flags &= ~REQ_F_LINK; - io_put_req(req, NULL); + io_put_req(req); return true; } @@ -921,21 +921,13 @@ static void io_free_req(struct io_kiocb *req, struct io_kiocb **nxt) * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. */ -static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) +static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { struct io_kiocb *nxt = NULL; if (refcount_dec_and_test(&req->refs)) io_free_req(req, &nxt); - return nxt; -} - -static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr) -{ - struct io_kiocb *nxt; - - nxt = io_put_req_find_next(req); if (nxt) { if (nxtptr) *nxtptr = nxt; @@ -944,6 +936,12 @@ static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr) } } +static void io_put_req(struct io_kiocb *req) +{ + if (refcount_dec_and_test(&req->refs)) + io_free_req(req, NULL); +} + static void io_double_put_req(struct io_kiocb *req) { /* drop both submit and complete references */ @@ -1197,15 +1195,18 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); io_complete_rw_common(kiocb, res); - io_put_req(req, NULL); + io_put_req(req); } static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *nxt = NULL; io_complete_rw_common(kiocb, res); - return io_put_req_find_next(req); + io_put_req_find_next(req, &nxt); + + return nxt; } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -1698,7 +1699,7 @@ static int io_nop(struct io_kiocb *req) return -EINVAL; io_cqring_add_event(req, 0); - io_put_req(req, NULL); + io_put_req(req); return 0; } @@ -1745,7 +1746,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req, ret); - io_put_req(req, nxt); + io_put_req_find_next(req, nxt); return 0; } @@ -1792,7 +1793,7 @@ static int io_sync_file_range(struct io_kiocb *req, if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req, ret); - io_put_req(req, nxt); + io_put_req_find_next(req, nxt); return 0; } @@ -1830,7 +1831,7 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, io_cqring_add_event(req, ret); if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; - io_put_req(req, nxt); + io_put_req_find_next(req, nxt); return 0; } #endif @@ -1884,7 +1885,7 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req, ret); - io_put_req(req, nxt); + io_put_req_find_next(req, nxt); return 0; #else return -EOPNOTSUPP; @@ -1947,7 +1948,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) io_cqring_add_event(req, ret); if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; - io_put_req(req, NULL); + io_put_req(req); return 0; } @@ -1995,7 +1996,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) io_cqring_ev_posted(ctx); - io_put_req(req, &nxt); + io_put_req_find_next(req, &nxt); if (nxt) *workptr = &nxt->work; } @@ -2022,7 +2023,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - io_put_req(req, NULL); + io_put_req(req); } else { io_queue_async_work(req); } @@ -2115,7 +2116,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (mask) { io_cqring_ev_posted(ctx); - io_put_req(req, nxt); + io_put_req_find_next(req, nxt); } return ipt.error; } @@ -2157,7 +2158,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) io_cqring_ev_posted(ctx); if (req->flags & REQ_F_LINK) req->flags |= REQ_F_FAIL_LINK; - io_put_req(req, NULL); + io_put_req(req); return HRTIMER_NORESTART; } @@ -2200,7 +2201,7 @@ static int io_timeout_remove(struct io_kiocb *req, io_cqring_ev_posted(ctx); if (req->flags & REQ_F_LINK) req->flags |= REQ_F_FAIL_LINK; - io_put_req(req, NULL); + io_put_req(req); return 0; } @@ -2216,8 +2217,8 @@ static int io_timeout_remove(struct io_kiocb *req, spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_put_req(treq, NULL); - io_put_req(req, NULL); + io_put_req(treq); + io_put_req(req); return 0; } @@ -2352,7 +2353,7 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req, ret); - io_put_req(req, nxt); + io_put_req_find_next(req, nxt); return 0; } @@ -2498,13 +2499,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } /* drop submission reference */ - io_put_req(req, NULL); + io_put_req(req); if (ret) { if (req->flags & REQ_F_LINK) req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req, ret); - io_put_req(req, NULL); + io_put_req(req); } /* async context always use a copy of the sqe */ @@ -2635,7 +2636,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) } io_cqring_add_event(req, ret); - io_put_req(req, NULL); + io_put_req(req); return HRTIMER_NORESTART; } @@ -2667,7 +2668,7 @@ static int io_queue_linked_timeout(struct io_kiocb *req, struct io_kiocb *nxt) ret = 0; err: /* drop submission reference */ - io_put_req(nxt, NULL); + io_put_req(nxt); if (ret) { struct io_ring_ctx *ctx = req->ctx; @@ -2680,7 +2681,7 @@ static int io_queue_linked_timeout(struct io_kiocb *req, struct io_kiocb *nxt) io_cqring_fill_event(nxt, ret); trace_io_uring_fail_link(req, nxt); io_commit_cqring(ctx); - io_put_req(nxt, NULL); + io_put_req(nxt); ret = -ECANCELED; } @@ -2746,14 +2747,14 @@ static int __io_queue_sqe(struct io_kiocb *req) /* drop submission reference */ err: - io_put_req(req, NULL); + io_put_req(req); /* and drop final reference, if we failed */ if (ret) { io_cqring_add_event(req, ret); if (req->flags & REQ_F_LINK) req->flags |= REQ_F_FAIL_LINK; - io_put_req(req, NULL); + io_put_req(req); } return ret; From c69f8dbe2426cbf6150407b7e86ce85bb463c1dc Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Sat, 9 Nov 2019 11:00:08 +0800 Subject: [PATCH 46/64] io_uring: separate the io_free_req and io_free_req_find_next interface Similar to the distinction between io_put_req and io_put_req_find_next, io_free_req has been modified similarly, with no functional changes. Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1597838d5073..2b4257a965db 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -883,7 +883,7 @@ static void io_fail_links(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static void io_free_req(struct io_kiocb *req, struct io_kiocb **nxt) +static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) { if (likely(!(req->flags & REQ_F_LINK))) { __io_free_req(req); @@ -917,6 +917,11 @@ static void io_free_req(struct io_kiocb *req, struct io_kiocb **nxt) __io_free_req(req); } +static void io_free_req(struct io_kiocb *req) +{ + io_free_req_find_next(req, NULL); +} + /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. @@ -926,7 +931,7 @@ static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) struct io_kiocb *nxt = NULL; if (refcount_dec_and_test(&req->refs)) - io_free_req(req, &nxt); + io_free_req_find_next(req, &nxt); if (nxt) { if (nxtptr) @@ -939,7 +944,7 @@ static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) static void io_put_req(struct io_kiocb *req) { if (refcount_dec_and_test(&req->refs)) - io_free_req(req, NULL); + io_free_req(req); } static void io_double_put_req(struct io_kiocb *req) @@ -1006,7 +1011,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); } else { - io_free_req(req, NULL); + io_free_req(req); } } } From 46568e9be70ff8211d986685f08d919376c32998 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 10 Nov 2019 08:40:53 -0700 Subject: [PATCH 47/64] io_uring: fix error clear of ->file_table in io_sqe_files_register() syzbot reports that when using failslab and friends, we can get a double free in io_sqe_files_unregister(): BUG: KASAN: double-free or invalid-free in io_sqe_files_unregister+0x20b/0x300 fs/io_uring.c:3185 CPU: 1 PID: 8819 Comm: syz-executor452 Not tainted 5.4.0-rc6-next-20191108 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x197/0x210 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374 kasan_report_invalid_free+0x65/0xa0 mm/kasan/report.c:468 __kasan_slab_free+0x13a/0x150 mm/kasan/common.c:450 kasan_slab_free+0xe/0x10 mm/kasan/common.c:480 __cache_free mm/slab.c:3426 [inline] kfree+0x10a/0x2c0 mm/slab.c:3757 io_sqe_files_unregister+0x20b/0x300 fs/io_uring.c:3185 io_ring_ctx_free fs/io_uring.c:3998 [inline] io_ring_ctx_wait_and_kill+0x348/0x700 fs/io_uring.c:4060 io_uring_release+0x42/0x50 fs/io_uring.c:4068 __fput+0x2ff/0x890 fs/file_table.c:280 ____fput+0x16/0x20 fs/file_table.c:313 task_work_run+0x145/0x1c0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x904/0x2e60 kernel/exit.c:817 do_group_exit+0x135/0x360 kernel/exit.c:921 __do_sys_exit_group kernel/exit.c:932 [inline] __se_sys_exit_group kernel/exit.c:930 [inline] __x64_sys_exit_group+0x44/0x50 kernel/exit.c:930 do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x43f2c8 Code: 31 b8 c5 f7 ff ff 48 8b 5c 24 28 48 8b 6c 24 30 4c 8b 64 24 38 4c 8b 6c 24 40 4c 8b 74 24 48 4c 8b 7c 24 50 48 83 c4 58 c3 66 <0f> 1f 84 00 00 00 00 00 48 8d 35 59 ca 00 00 0f b6 d2 48 89 fb 48 RSP: 002b:00007ffd5b976008 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 000000000043f2c8 RDX: 0000000000000000 RSI: 000000000000003c RDI: 0000000000000000 RBP: 00000000004bf0a8 R08: 00000000000000e7 R09: ffffffffffffffd0 R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001 R13: 00000000006d1180 R14: 0000000000000000 R15: 0000000000000000 This happens if we fail allocating the file tables. For that case we do free the file table correctly, but we forget to set it to NULL. This means that ring teardown will see it as being non-NULL, and attempt to free it again. Fix this by clearing the file_table pointer if we free the table. Reported-by: syzbot+3254bc44113ae1e331ee@syzkaller.appspotmail.com Fixes: 65e19f54d29c ("io_uring: support for larger fixed file sets") Reviewed-by: Bob Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2b4257a965db..737c311c6da5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3488,6 +3488,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { kfree(ctx->file_table); + ctx->file_table = NULL; return -ENOMEM; } From 8e3cca12706231daf8daf90dbde59f1665135e48 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 9 Nov 2019 19:52:33 -0700 Subject: [PATCH 48/64] io_uring: convert accept4() -ERESTARTSYS into -EINTR If we cancel a pending accept operating with a signal, we get -ERESTARTSYS returned. Turn that into -EINTR for userspace, we should not be return -ERESTARTSYS. Fixes: 17f2fe35d080 ("io_uring: add support for IORING_OP_ACCEPT") Reported-by: Hrvoje Zeba Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 737c311c6da5..bbb3889dae41 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1887,6 +1887,8 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->work.flags |= IO_WQ_WORK_NEEDS_FILES; return -EAGAIN; } + if (ret == -ERESTARTSYS) + ret = -EINTR; if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; io_cqring_add_event(req, ret); From 0ddf92e848ab7abf216f218ee363eb9b9650e98f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 8 Nov 2019 08:52:53 -0700 Subject: [PATCH 49/64] io_uring: provide fallback request for OOM situations One thing that really sucks for userspace APIs is if the kernel passes back -ENOMEM/-EAGAIN for resource shortages. The application really has no idea of what to do in those cases. Should it try and reap completions? Probably a good idea. Will it solve the issue? Who knows. This patch adds a simple fallback mechanism if we fail to allocate memory for a request. If we fail allocating memory from the slab for a request, we punt to a pre-allocated request. There's just one of these per io_ring_ctx, but the important part is if we ever return -EBUSY to the application, the applications knows that it can wait for events and make forward progress when events have completed. This is the important part. Signed-off-by: Jens Axboe --- fs/io_uring.c | 46 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index bbb3889dae41..2c838baf11b4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -239,6 +239,9 @@ struct io_ring_ctx { /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ struct completion *completions; + /* if all else fails... */ + struct io_kiocb *fallback_req; + #if defined(CONFIG_UNIX) struct socket *ring_sock; #endif @@ -408,6 +411,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (!ctx) return NULL; + ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL); + if (!ctx->fallback_req) + goto err; + ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL); if (!ctx->completions) goto err; @@ -433,6 +440,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->inflight_list); return ctx; err: + if (ctx->fallback_req) + kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx->completions); kfree(ctx); return NULL; @@ -712,6 +721,23 @@ static void io_cqring_add_event(struct io_kiocb *req, long res) io_cqring_ev_posted(ctx); } +static inline bool io_is_fallback_req(struct io_kiocb *req) +{ + return req == (struct io_kiocb *) + ((unsigned long) req->ctx->fallback_req & ~1UL); +} + +static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + req = ctx->fallback_req; + if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req)) + return req; + + return NULL; +} + static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, struct io_submit_state *state) { @@ -724,7 +750,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, if (!state) { req = kmem_cache_alloc(req_cachep, gfp); if (unlikely(!req)) - goto out; + goto fallback; } else if (!state->free_reqs) { size_t sz; int ret; @@ -739,7 +765,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, if (unlikely(ret <= 0)) { state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); if (!state->reqs[0]) - goto out; + goto fallback; ret = 1; } state->free_reqs = ret - 1; @@ -751,6 +777,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, state->cur_req++; } +got_it: req->file = NULL; req->ctx = ctx; req->flags = 0; @@ -759,7 +786,10 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, req->result = 0; INIT_IO_WORK(&req->work, io_wq_submit_work); return req; -out: +fallback: + req = io_get_fallback_req(ctx); + if (req) + goto got_it; percpu_ref_put(&ctx->refs); return NULL; } @@ -789,7 +819,10 @@ static void __io_free_req(struct io_kiocb *req) spin_unlock_irqrestore(&ctx->inflight_lock, flags); } percpu_ref_put(&ctx->refs); - kmem_cache_free(req_cachep, req); + if (likely(!io_is_fallback_req(req))) + kmem_cache_free(req_cachep, req); + else + clear_bit_unlock(0, (unsigned long *) ctx->fallback_req); } static bool io_link_cancel_timeout(struct io_kiocb *req) @@ -1005,8 +1038,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * completions for those, only batch free for fixed * file and non-linked commands. */ - if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == - REQ_F_FIXED_FILE) { + if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == + REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); @@ -4128,6 +4161,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) ring_pages(ctx->sq_entries, ctx->cq_entries)); free_uid(ctx->user); kfree(ctx->completions); + kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx); } From 47f467686ec02fc07fd5c6bb34b6f6736e2884b0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 9 Nov 2019 17:43:02 -0700 Subject: [PATCH 50/64] io_uring: make ASYNC_CANCEL work with poll and timeout It's a little confusing that we have multiple types of command cancellation opcodes now that we have a generic one. Make the generic one work with POLL_ADD and TIMEOUT commands as well, that makes for an easier to use API for the application. The fact that they currently don't is a bit confusing. Add a helper that takes care of it, so we can user it from both IORING_OP_ASYNC_CANCEL and from the linked timeout cancellation. Reported-by: Hrvoje Zeba Signed-off-by: Jens Axboe --- fs/io_uring.c | 138 +++++++++++++++++++++++++++++--------------------- 1 file changed, 80 insertions(+), 58 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2c838baf11b4..b70982502336 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1959,6 +1959,20 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->completion_lock); } +static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) +{ + struct io_kiocb *req; + + list_for_each_entry(req, &ctx->cancel_list, list) { + if (req->user_data != sqe_addr) + continue; + io_poll_remove_one(req); + return 0; + } + + return -ENOENT; +} + /* * Find a running poll command that matches one specified in sqe->addr, * and remove it if found. @@ -1966,8 +1980,7 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx) static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *poll_req, *next; - int ret = -ENOENT; + int ret; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -1976,13 +1989,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) { - if (READ_ONCE(sqe->addr) == poll_req->user_data) { - io_poll_remove_one(poll_req); - ret = 0; - break; - } - } + ret = io_poll_cancel(ctx, READ_ONCE(sqe->addr)); spin_unlock_irq(&ctx->completion_lock); io_cqring_add_event(req, ret); @@ -2202,6 +2209,31 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) +{ + struct io_kiocb *req; + int ret = -ENOENT; + + list_for_each_entry(req, &ctx->timeout_list, list) { + if (user_data == req->user_data) { + list_del_init(&req->list); + ret = 0; + break; + } + } + + if (ret == -ENOENT) + return ret; + + ret = hrtimer_try_to_cancel(&req->timeout.timer); + if (ret == -1) + return -EALREADY; + + io_cqring_fill_event(req, -ECANCELED); + io_put_req(req); + return 0; +} + /* * Remove or update an existing timeout command */ @@ -2209,10 +2241,8 @@ static int io_timeout_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *treq; - int ret = -ENOENT; - __u64 user_data; unsigned flags; + int ret; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -2222,42 +2252,15 @@ static int io_timeout_remove(struct io_kiocb *req, if (flags) return -EINVAL; - user_data = READ_ONCE(sqe->addr); spin_lock_irq(&ctx->completion_lock); - list_for_each_entry(treq, &ctx->timeout_list, list) { - if (user_data == treq->user_data) { - list_del_init(&treq->list); - ret = 0; - break; - } - } + ret = io_timeout_cancel(ctx, READ_ONCE(sqe->addr)); - /* didn't find timeout */ - if (ret) { -fill_ev: - io_cqring_fill_event(req, ret); - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; - io_put_req(req); - return 0; - } - - ret = hrtimer_try_to_cancel(&treq->timeout.timer); - if (ret == -1) { - ret = -EBUSY; - goto fill_ev; - } - - io_cqring_fill_event(req, 0); - io_cqring_fill_event(treq, -ECANCELED); + io_cqring_fill_event(req, ret); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - - io_put_req(treq); + if (ret < 0 && req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_put_req(req); return 0; } @@ -2374,12 +2377,39 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) return ret; } +static void io_async_find_and_cancel(struct io_ring_ctx *ctx, + struct io_kiocb *req, __u64 sqe_addr, + struct io_kiocb **nxt) +{ + unsigned long flags; + int ret; + + ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr); + if (ret != -ENOENT) { + spin_lock_irqsave(&ctx->completion_lock, flags); + goto done; + } + + spin_lock_irqsave(&ctx->completion_lock, flags); + ret = io_timeout_cancel(ctx, sqe_addr); + if (ret != -ENOENT) + goto done; + ret = io_poll_cancel(ctx, sqe_addr); +done: + io_cqring_fill_event(req, ret); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + io_cqring_ev_posted(ctx); + + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; + io_put_req_find_next(req, nxt); +} + static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_kiocb **nxt) { struct io_ring_ctx *ctx = req->ctx; - void *sqe_addr; - int ret; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -2387,13 +2417,7 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, sqe->cancel_flags) return -EINVAL; - sqe_addr = (void *) (unsigned long) READ_ONCE(sqe->addr); - ret = io_async_cancel_one(ctx, sqe_addr); - - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), NULL); return 0; } @@ -2655,7 +2679,6 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *prev = NULL; unsigned long flags; - int ret = -ETIME; spin_lock_irqsave(&ctx->completion_lock, flags); @@ -2671,12 +2694,11 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { - void *user_data = (void *) (unsigned long) prev->user_data; - ret = io_async_cancel_one(ctx, user_data); + io_async_find_and_cancel(ctx, req, prev->user_data, NULL); + } else { + io_cqring_add_event(req, -ETIME); + io_put_req(req); } - - io_cqring_add_event(req, ret); - io_put_req(req); return HRTIMER_NORESTART; } From c1edbf5f081be9fbbea68c1d564b773e59c1acf3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 10 Nov 2019 16:56:04 -0700 Subject: [PATCH 51/64] io_uring: flag SQPOLL busy condition to userspace Now that we have backpressure, for SQPOLL, we have one more condition that warrants flagging that the application needs to enter the kernel: we failed to submit IO due to backpressure. Make sure we catch that and flag it appropriately. If we run into backpressure issues with the SQPOLL thread, flag it as such to the application by setting IORING_SQ_NEED_WAKEUP. This will cause the application to enter the kernel, and that will flush the backlog and clear the condition. Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b70982502336..912d2648f8db 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3118,16 +3118,16 @@ static int io_sq_thread(void *data) DEFINE_WAIT(wait); unsigned inflight; unsigned long timeout; + int ret; complete(&ctx->completions[1]); old_fs = get_fs(); set_fs(USER_DS); - timeout = inflight = 0; + ret = timeout = inflight = 0; while (!kthread_should_park()) { unsigned int to_submit; - int ret; if (inflight) { unsigned nr_events = 0; @@ -3161,13 +3161,21 @@ static int io_sq_thread(void *data) } to_submit = io_sqring_entries(ctx); - if (!to_submit) { + + /* + * If submit got -EBUSY, flag us as needing the application + * to enter the kernel to reap and flush events. + */ + if (!to_submit || ret == -EBUSY) { /* * We're polling. If we're within the defined idle * period, then let us spin without work before going - * to sleep. + * to sleep. The exception is if we got EBUSY doing + * more IO, we should wait for the application to + * reap events and wake us up. */ - if (inflight || !time_after(jiffies, timeout)) { + if (inflight || + (!time_after(jiffies, timeout) && ret != -EBUSY)) { cond_resched(); continue; } @@ -3193,7 +3201,7 @@ static int io_sq_thread(void *data) smp_mb(); to_submit = io_sqring_entries(ctx); - if (!to_submit) { + if (!to_submit || ret == -EBUSY) { if (kthread_should_park()) { finish_wait(&ctx->sqo_wait, &wait); break; @@ -4351,6 +4359,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { + if (!list_empty_careful(&ctx->cq_overflow_list)) + io_cqring_overflow_flush(ctx, false); if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sqo_wait); submitted = to_submit; From 768134d4f48109b90f4248feecbeeb7d684e410c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 10 Nov 2019 20:30:53 -0700 Subject: [PATCH 52/64] io_uring: don't do flush cancel under inflight_lock We can't safely cancel under the inflight lock. If the work hasn't been started yet, then io_wq_cancel_work() simply marks the work as cancelled and invokes the work handler. But if the work completion needs to grab the inflight lock because it's grabbing user files, then we'll deadlock trying to finish the work as we already hold that lock. Instead grab a reference to the request, if it isn't already zero. If it's zero, then we know it's going through completion anyway, and we can safely ignore it. If it's not zero, then we can drop the lock and attempt to cancel from there. This also fixes a missing finish_wait() at the end of io_uring_cancel_files(). Signed-off-by: Jens Axboe --- fs/io_uring.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 912d2648f8db..dcb0602c9fd2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4257,33 +4257,34 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, while (!list_empty_careful(&ctx->inflight_list)) { enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; + struct io_kiocb *cancel_req = NULL; spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (req->work.files == files) { - ret = io_wq_cancel_work(ctx->io_wq, &req->work); - break; - } + if (req->work.files != files) + continue; + /* req is being completed, ignore */ + if (!refcount_inc_not_zero(&req->refs)) + continue; + cancel_req = req; + break; } - if (ret == IO_WQ_CANCEL_RUNNING) + if (cancel_req) prepare_to_wait(&ctx->inflight_wait, &wait, - TASK_UNINTERRUPTIBLE); - + TASK_UNINTERRUPTIBLE); spin_unlock_irq(&ctx->inflight_lock); - /* - * We need to keep going until we get NOTFOUND. We only cancel - * one work at the time. - * - * If we get CANCEL_RUNNING, then wait for a work to complete - * before continuing. - */ - if (ret == IO_WQ_CANCEL_OK) - continue; - else if (ret != IO_WQ_CANCEL_RUNNING) + if (cancel_req) { + ret = io_wq_cancel_work(ctx->io_wq, &cancel_req->work); + io_put_req(cancel_req); + } + + /* We need to keep going until we don't find a matching req */ + if (!cancel_req) break; schedule(); } + finish_wait(&ctx->inflight_wait, &wait); } static int io_uring_flush(struct file *file, void *data) From 76a46e066e2d93bd333599d1c84c605c2c4cc909 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 10 Nov 2019 23:34:16 -0700 Subject: [PATCH 53/64] io_uring: fix -ENOENT issue with linked timer with short timeout If you prep a read (for example) that needs to get punted to async context with a timer, if the timeout is sufficiently short, the timer request will get completed with -ENOENT as it could not find the read. The issue is that we prep and start the timer before we start the read. Hence the timer can trigger before the read is even started, and the end result is then that the timer completes with -ENOENT, while the read starts instead of being cancelled by the timer. Fix this by splitting the linked timer into two parts: 1) Prep and validate the linked timer 2) Start timer The read is then started between steps 1 and 2, so we know that the timer will always have a consistent view of the read request state. Reported-by: Hrvoje Zeba Signed-off-by: Jens Axboe --- fs/io_uring.c | 127 +++++++++++++++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 53 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index dcb0602c9fd2..ad7f569319c2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -855,7 +855,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) */ nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); while (nxt) { - list_del(&nxt->list); + list_del_init(&nxt->list); if (!list_empty(&req->link_list)) { INIT_LIST_HEAD(&nxt->link_list); list_splice(&req->link_list, &nxt->link_list); @@ -2688,13 +2688,17 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) */ if (!list_empty(&req->list)) { prev = list_entry(req->list.prev, struct io_kiocb, link_list); - list_del_init(&req->list); + if (refcount_inc_not_zero(&prev->refs)) + list_del_init(&req->list); + else + prev = NULL; } spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { io_async_find_and_cancel(ctx, req, prev->user_data, NULL); + io_put_req(prev); } else { io_cqring_add_event(req, -ETIME); io_put_req(req); @@ -2702,78 +2706,84 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static int io_queue_linked_timeout(struct io_kiocb *req, struct io_kiocb *nxt) +static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts, + enum hrtimer_mode *mode) { - const struct io_uring_sqe *sqe = nxt->submit.sqe; - enum hrtimer_mode mode; - struct timespec64 ts; - int ret = -EINVAL; + struct io_ring_ctx *ctx = req->ctx; - if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off) - goto err; - if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS) - goto err; - if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) { - ret = -EFAULT; - goto err; + /* + * If the list is now empty, then our linked request finished before + * we got a chance to setup the timer + */ + spin_lock_irq(&ctx->completion_lock); + if (!list_empty(&req->list)) { + req->timeout.timer.function = io_link_timeout_fn; + hrtimer_start(&req->timeout.timer, timespec64_to_ktime(*ts), + *mode); } + spin_unlock_irq(&ctx->completion_lock); - req->flags |= REQ_F_LINK_TIMEOUT; - - if (sqe->timeout_flags & IORING_TIMEOUT_ABS) - mode = HRTIMER_MODE_ABS; - else - mode = HRTIMER_MODE_REL; - hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, mode); - nxt->timeout.timer.function = io_link_timeout_fn; - hrtimer_start(&nxt->timeout.timer, timespec64_to_ktime(ts), mode); - ret = 0; -err: /* drop submission reference */ - io_put_req(nxt); - - if (ret) { - struct io_ring_ctx *ctx = req->ctx; - - /* - * Break the link and fail linked timeout, parent will get - * failed by the regular submission path. - */ - list_del(&nxt->list); - io_cqring_fill_event(nxt, ret); - trace_io_uring_fail_link(req, nxt); - io_commit_cqring(ctx); - io_put_req(nxt); - ret = -ECANCELED; - } - - return ret; + io_put_req(req); } -static inline struct io_kiocb *io_get_linked_timeout(struct io_kiocb *req) +static int io_validate_link_timeout(const struct io_uring_sqe *sqe, + struct timespec64 *ts) +{ + if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off) + return -EINVAL; + if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS) + return -EINVAL; + if (get_timespec64(ts, u64_to_user_ptr(sqe->addr))) + return -EFAULT; + + return 0; +} + +static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req, + struct timespec64 *ts, + enum hrtimer_mode *mode) { struct io_kiocb *nxt; + int ret; if (!(req->flags & REQ_F_LINK)) return NULL; nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - if (nxt && nxt->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) - return nxt; + if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT) + return NULL; - return NULL; + ret = io_validate_link_timeout(nxt->submit.sqe, ts); + if (ret) { + list_del_init(&nxt->list); + io_cqring_add_event(nxt, ret); + io_double_put_req(nxt); + return ERR_PTR(-ECANCELED); + } + + if (nxt->submit.sqe->timeout_flags & IORING_TIMEOUT_ABS) + *mode = HRTIMER_MODE_ABS; + else + *mode = HRTIMER_MODE_REL; + + req->flags |= REQ_F_LINK_TIMEOUT; + hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, *mode); + return nxt; } static int __io_queue_sqe(struct io_kiocb *req) { + enum hrtimer_mode mode; struct io_kiocb *nxt; + struct timespec64 ts; int ret; - nxt = io_get_linked_timeout(req); - if (unlikely(nxt)) { - ret = io_queue_linked_timeout(req, nxt); - if (ret) - goto err; + nxt = io_prep_linked_timeout(req, &ts, &mode); + if (IS_ERR(nxt)) { + ret = PTR_ERR(nxt); + nxt = NULL; + goto err; } ret = __io_submit_sqe(req, NULL, true); @@ -2803,14 +2813,25 @@ static int __io_queue_sqe(struct io_kiocb *req) * submit reference when the iocb is actually submitted. */ io_queue_async_work(req); + + if (nxt) + io_queue_linked_timeout(nxt, &ts, &mode); + return 0; } } - /* drop submission reference */ err: + /* drop submission reference */ io_put_req(req); + if (nxt) { + if (!ret) + io_queue_linked_timeout(nxt, &ts, &mode); + else + io_put_req(nxt); + } + /* and drop final reference, if we failed */ if (ret) { io_cqring_add_event(req, ret); From 960e432dfa5927892a9b170d14de874597b84849 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Nov 2019 07:56:39 -0700 Subject: [PATCH 54/64] io_uring: use correct "is IO worker" helper Since we switched to io-wq, the dependent link optimization for when to pass back work inline has been broken. Fix this by providing a suitable io-wq helper for io_uring to use to detect when to do this. Fixes: 561fb04a6a22 ("io_uring: replace workqueue usage with io-wq") Signed-off-by: Jens Axboe --- fs/io-wq.h | 4 ++++ fs/io_uring.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/io-wq.h b/fs/io-wq.h index 8cb345256f35..cc50754d028c 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -62,4 +62,8 @@ static inline void io_wq_worker_running(struct task_struct *tsk) } #endif +static inline bool io_wq_current_is_worker(void) +{ + return in_task() && (current->flags & PF_IO_WORKER); +} #endif diff --git a/fs/io_uring.c b/fs/io_uring.c index ad7f569319c2..3c573f0578a8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -872,7 +872,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) /* we dropped this link, get next */ nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - } else if (nxtptr && current_work()) { + } else if (nxtptr && io_wq_current_is_worker()) { *nxtptr = nxt; break; } else { From 7c9e7f0fe0d8abf856a957c150c48778e75154c1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Nov 2019 08:15:53 -0700 Subject: [PATCH 55/64] io_uring: fix potential deadlock in io_poll_wake() We attempt to run the poll completion inline, but we're using trylock to do so. This avoids a deadlock since we're grabbing the locks in reverse order at this point, we already hold the poll wq lock and we're trying to grab the completion lock, while the normal rules are the reverse of that order. IO completion for a timeout link will need to grab the completion lock, but that's not safe from this context. Put the completion under the completion_lock in io_poll_wake(), and mark the request as entering the completion with the completion_lock already held. Fixes: 2665abfd757f ("io_uring: add support for linked SQE timeouts") Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3c573f0578a8..247e5e1137a3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -341,6 +341,7 @@ struct io_kiocb { #define REQ_F_ISREG 2048 /* regular file */ #define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ #define REQ_F_INFLIGHT 8192 /* on inflight list */ +#define REQ_F_COMP_LOCKED 16384 /* completion under lock */ u64 user_data; u32 result; u32 sequence; @@ -931,14 +932,15 @@ static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) */ if (req->flags & REQ_F_FAIL_LINK) { io_fail_links(req); - } else if (req->flags & REQ_F_LINK_TIMEOUT) { + } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) == + REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; /* * If this is a timeout link, we could be racing with the * timeout timer. Grab the completion lock for this case to - * protection against that. + * protect against that. */ spin_lock_irqsave(&ctx->completion_lock, flags); io_req_link_next(req, nxt); @@ -2064,13 +2066,20 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, list_del_init(&poll->wait.entry); + /* + * Run completion inline if we can. We're using trylock here because + * we are violating the completion_lock -> poll wq lock ordering. + * If we have a link timeout we're going to need the completion_lock + * for finalizing the request, mark us as having grabbed that already. + */ if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { list_del(&req->list); io_poll_complete(req, mask); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - io_put_req(req); } else { io_queue_async_work(req); } From 15dff286d0e0087d4dcd7049911f179e4e4cfd94 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Nov 2019 09:09:23 -0700 Subject: [PATCH 56/64] io_uring: check for validity of ->rings in teardown Normally the rings are always valid, the exception is if we failed to allocate the rings at setup time. syzbot reports this: RSP: 002b:00007ffd6e8aa078 EFLAGS: 00000246 ORIG_RAX: 00000000000001a9 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000441229 RDX: 0000000000000002 RSI: 0000000020000140 RDI: 0000000000000d0d RBP: 00007ffd6e8aa090 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: ffffffffffffffff R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000000 kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 8903 Comm: syz-executor410 Not tainted 5.4.0-rc7-next-20191113 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__read_once_size include/linux/compiler.h:199 [inline] RIP: 0010:__io_commit_cqring fs/io_uring.c:496 [inline] RIP: 0010:io_commit_cqring+0x1e1/0xdb0 fs/io_uring.c:592 Code: 03 0f 8e df 09 00 00 48 8b 45 d0 4c 8d a3 c0 00 00 00 4c 89 e2 48 c1 ea 03 44 8b b8 c0 01 00 00 48 b8 00 00 00 00 00 fc ff df <0f> b6 14 02 4c 89 e0 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 61 RSP: 0018:ffff88808f51fc08 EFLAGS: 00010006 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff815abe4a RDX: 0000000000000018 RSI: ffffffff81d168d5 RDI: ffff8880a9166100 RBP: ffff88808f51fc70 R08: 0000000000000004 R09: ffffed1011ea3f7d R10: ffffed1011ea3f7c R11: 0000000000000003 R12: 00000000000000c0 R13: ffff8880a91661c0 R14: 1ffff1101522cc10 R15: 0000000000000000 FS: 0000000001e7a880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000140 CR3: 000000009a74c000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: io_cqring_overflow_flush+0x6b9/0xa90 fs/io_uring.c:673 io_ring_ctx_wait_and_kill+0x24f/0x7c0 fs/io_uring.c:4260 io_uring_create fs/io_uring.c:4600 [inline] io_uring_setup+0x1256/0x1cc0 fs/io_uring.c:4626 __do_sys_io_uring_setup fs/io_uring.c:4639 [inline] __se_sys_io_uring_setup fs/io_uring.c:4636 [inline] __x64_sys_io_uring_setup+0x54/0x80 fs/io_uring.c:4636 do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x441229 Code: e8 5c ae 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 bb 0a fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffd6e8aa078 EFLAGS: 00000246 ORIG_RAX: 00000000000001a9 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000441229 RDX: 0000000000000002 RSI: 0000000020000140 RDI: 0000000000000d0d RBP: 00007ffd6e8aa090 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: ffffffffffffffff R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000000 Modules linked in: ---[ end trace b0f5b127a57f623f ]--- RIP: 0010:__read_once_size include/linux/compiler.h:199 [inline] RIP: 0010:__io_commit_cqring fs/io_uring.c:496 [inline] RIP: 0010:io_commit_cqring+0x1e1/0xdb0 fs/io_uring.c:592 Code: 03 0f 8e df 09 00 00 48 8b 45 d0 4c 8d a3 c0 00 00 00 4c 89 e2 48 c1 ea 03 44 8b b8 c0 01 00 00 48 b8 00 00 00 00 00 fc ff df <0f> b6 14 02 4c 89 e0 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 61 RSP: 0018:ffff88808f51fc08 EFLAGS: 00010006 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff815abe4a RDX: 0000000000000018 RSI: ffffffff81d168d5 RDI: ffff8880a9166100 RBP: ffff88808f51fc70 R08: 0000000000000004 R09: ffffed1011ea3f7d R10: ffffed1011ea3f7c R11: 0000000000000003 R12: 00000000000000c0 R13: ffff8880a91661c0 R14: 1ffff1101522cc10 R15: 0000000000000000 FS: 0000000001e7a880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000140 CR3: 000000009a74c000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 which is exactly the case of failing to allocate the SQ/CQ rings, and then entering shutdown. Check if the rings are valid before trying to access them at shutdown time. Reported-by: syzbot+21147d79607d724bd6f3@syzkaller.appspotmail.com Fixes: 1d7bb1d50fb4 ("io_uring: add support for backlogged CQ ring") Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 247e5e1137a3..99822bf89924 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4265,7 +4265,9 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_wq_cancel_all(ctx->io_wq); io_iopoll_reap_events(ctx); - io_cqring_overflow_flush(ctx, true); + /* if we failed setting up the ctx, we might not have any rings */ + if (ctx->rings) + io_cqring_overflow_flush(ctx, true); wait_for_completion(&ctx->completions[0]); io_ring_ctx_free(ctx); } From 7d7230652e7c788ef908536fd79f4cca077f269f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Nov 2019 22:31:31 -0700 Subject: [PATCH 57/64] io_wq: add get/put_work handlers to io_wq_create() For cancellation, we need to ensure that the work item stays valid for as long as ->cur_work is valid. Right now we can't safely dereference the work item even under the wqe->lock, because while the ->cur_work pointer will remain valid, the work could be completing and be freed in parallel. Only invoke ->get/put_work() on items we know that the caller queued themselves. Add IO_WQ_WORK_INTERNAL for io-wq to use, which is needed when we're queueing a flush item, for instance. Signed-off-by: Jens Axboe --- fs/io-wq.c | 25 +++++++++++++++++++++++-- fs/io-wq.h | 7 ++++++- fs/io_uring.c | 17 ++++++++++++++++- 3 files changed, 45 insertions(+), 4 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 33b14b85752b..26d81540c1fc 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -106,6 +106,9 @@ struct io_wq { unsigned long state; unsigned nr_wqes; + get_work_fn *get_work; + put_work_fn *put_work; + struct task_struct *manager; struct user_struct *user; struct mm_struct *mm; @@ -392,7 +395,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) static void io_worker_handle_work(struct io_worker *worker) __releases(wqe->lock) { - struct io_wq_work *work, *old_work; + struct io_wq_work *work, *old_work = NULL, *put_work = NULL; struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; @@ -424,6 +427,8 @@ static void io_worker_handle_work(struct io_worker *worker) wqe->flags |= IO_WQE_FLAG_STALLED; spin_unlock_irq(&wqe->lock); + if (put_work && wq->put_work) + wq->put_work(old_work); if (!work) break; next: @@ -444,6 +449,11 @@ static void io_worker_handle_work(struct io_worker *worker) if (worker->mm) work->flags |= IO_WQ_WORK_HAS_MM; + if (wq->get_work && !(work->flags & IO_WQ_WORK_INTERNAL)) { + put_work = work; + wq->get_work(work); + } + old_work = work; work->func(&work); @@ -455,6 +465,12 @@ static void io_worker_handle_work(struct io_worker *worker) } if (work && work != old_work) { spin_unlock_irq(&wqe->lock); + + if (put_work && wq->put_work) { + wq->put_work(put_work); + put_work = NULL; + } + /* dependent work not hashed */ hash = -1U; goto next; @@ -950,13 +966,15 @@ void io_wq_flush(struct io_wq *wq) init_completion(&data.done); INIT_IO_WORK(&data.work, io_wq_flush_func); + data.work.flags |= IO_WQ_WORK_INTERNAL; io_wqe_enqueue(wqe, &data.work); wait_for_completion(&data.done); } } struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, - struct user_struct *user) + struct user_struct *user, get_work_fn *get_work, + put_work_fn *put_work) { int ret = -ENOMEM, i, node; struct io_wq *wq; @@ -972,6 +990,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, return ERR_PTR(-ENOMEM); } + wq->get_work = get_work; + wq->put_work = put_work; + /* caller must already hold a reference to this */ wq->user = user; diff --git a/fs/io-wq.h b/fs/io-wq.h index cc50754d028c..4b29f922f80c 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -10,6 +10,7 @@ enum { IO_WQ_WORK_NEEDS_USER = 8, IO_WQ_WORK_NEEDS_FILES = 16, IO_WQ_WORK_UNBOUND = 32, + IO_WQ_WORK_INTERNAL = 64, IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -34,8 +35,12 @@ struct io_wq_work { (work)->files = NULL; \ } while (0) \ +typedef void (get_work_fn)(struct io_wq_work *); +typedef void (put_work_fn)(struct io_wq_work *); + struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, - struct user_struct *user); + struct user_struct *user, + get_work_fn *get_work, put_work_fn *put_work); void io_wq_destroy(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); diff --git a/fs/io_uring.c b/fs/io_uring.c index 99822bf89924..e1a3b8b667e0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3822,6 +3822,20 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, return done ? done : err; } +static void io_put_work(struct io_wq_work *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + io_put_req(req); +} + +static void io_get_work(struct io_wq_work *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + refcount_inc(&req->refs); +} + static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { @@ -3871,7 +3885,8 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, ctx->user); + ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, ctx->user, + io_get_work, io_put_work); if (IS_ERR(ctx->io_wq)) { ret = PTR_ERR(ctx->io_wq); ctx->io_wq = NULL; From 36c2f9223e84c1aa84bfba90cb2e74b517c92a54 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Nov 2019 09:43:34 -0700 Subject: [PATCH 58/64] io-wq: ensure we have a stable view of ->cur_work for cancellations worker->cur_work is currently protected by the lock of the wqe that the worker belongs to. When we send a signal to a worker, we need a stable view of ->cur_work, so we need to hold that lock. But this doesn't work so well, since we have the opposite order potentially on queueing work. If POLL_ADD is used with a signalfd, then io_poll_wake() is called with the signal lock, and that sometimes needs to insert work items. Add a specific worker lock that protects the current work item. Then we can guarantee that the task we're sending a signal is currently processing the exact work we think it is. Reported-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Jens Axboe --- fs/io-wq.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 26d81540c1fc..4031b75541be 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -49,7 +49,9 @@ struct io_worker { struct task_struct *task; wait_queue_head_t wait; struct io_wqe *wqe; + struct io_wq_work *cur_work; + spinlock_t lock; struct rcu_head rcu; struct mm_struct *mm; @@ -323,7 +325,6 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->busy_list.head); } - worker->cur_work = work; /* * If worker is moving from bound to unbound (or vice versa), then @@ -402,17 +403,6 @@ static void io_worker_handle_work(struct io_worker *worker) do { unsigned hash = -1U; - /* - * Signals are either sent to cancel specific work, or to just - * cancel all work items. For the former, ->cur_work must - * match. ->cur_work is NULL at this point, since we haven't - * assigned any work, so it's safe to flush signals for that - * case. For the latter case of cancelling all work, the caller - * wil have set IO_WQ_BIT_CANCEL. - */ - if (signal_pending(current)) - flush_signals(current); - /* * If we got some work, mark us as busy. If we didn't, but * the list isn't empty, it means we stalled on hashed work. @@ -432,6 +422,14 @@ static void io_worker_handle_work(struct io_worker *worker) if (!work) break; next: + /* flush any pending signals before assigning new work */ + if (signal_pending(current)) + flush_signals(current); + + spin_lock_irq(&worker->lock); + worker->cur_work = work; + spin_unlock_irq(&worker->lock); + if ((work->flags & IO_WQ_WORK_NEEDS_FILES) && current->files != work->files) { task_lock(current); @@ -457,8 +455,12 @@ static void io_worker_handle_work(struct io_worker *worker) old_work = work; work->func(&work); - spin_lock_irq(&wqe->lock); + spin_lock_irq(&worker->lock); worker->cur_work = NULL; + spin_unlock_irq(&worker->lock); + + spin_lock_irq(&wqe->lock); + if (hash != -1U) { wqe->hash_map &= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; @@ -577,6 +579,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) worker->nulls_node.pprev = NULL; init_waitqueue_head(&worker->wait); worker->wqe = wqe; + spin_lock_init(&worker->lock); worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node, "io_wqe_worker-%d/%d", index, wqe->node); @@ -783,21 +786,20 @@ struct io_cb_cancel_data { static bool io_work_cancel(struct io_worker *worker, void *cancel_data) { struct io_cb_cancel_data *data = cancel_data; - struct io_wqe *wqe = data->wqe; unsigned long flags; bool ret = false; /* * Hold the lock to avoid ->cur_work going out of scope, caller - * may deference the passed in work. + * may dereference the passed in work. */ - spin_lock_irqsave(&wqe->lock, flags); + spin_lock_irqsave(&worker->lock, flags); if (worker->cur_work && data->cancel(worker->cur_work, data->caller_data)) { send_sig(SIGINT, worker->task, 1); ret = true; } - spin_unlock_irqrestore(&wqe->lock, flags); + spin_unlock_irqrestore(&worker->lock, flags); return ret; } @@ -864,13 +866,20 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_wq_work *work = data; + unsigned long flags; + bool ret = false; + if (worker->cur_work != work) + return false; + + spin_lock_irqsave(&worker->lock, flags); if (worker->cur_work == work) { send_sig(SIGINT, worker->task, 1); - return true; + ret = true; } + spin_unlock_irqrestore(&worker->lock, flags); - return false; + return ret; } static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, From e61df66c69b11bc050d233dc95714a6339192c28 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Nov 2019 13:54:49 -0700 Subject: [PATCH 59/64] io-wq: ensure free/busy list browsing see all items We have two lists for workers in io-wq, a busy and a free list. For certain operations we want to browse all workers, and we currently do that by browsing the two separate lists. But since these lists are RCU protected, we can potentially miss workers if they move between the two lists while we're browsing them. Add a third list, all_list, that simply holds all workers. A worker is added to that list when it starts, and removed when it exits. This makes the worker iteration cleaner, too. Reported-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Jens Axboe --- fs/io-wq.c | 41 +++++++++++------------------------------ 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 4031b75541be..fcb6c74209da 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -46,6 +46,7 @@ struct io_worker { refcount_t ref; unsigned flags; struct hlist_nulls_node nulls_node; + struct list_head all_list; struct task_struct *task; wait_queue_head_t wait; struct io_wqe *wqe; @@ -96,6 +97,7 @@ struct io_wqe { struct io_wq_nulls_list free_list; struct io_wq_nulls_list busy_list; + struct list_head all_list; struct io_wq *wq; }; @@ -212,6 +214,7 @@ static void io_worker_exit(struct io_worker *worker) spin_lock_irq(&wqe->lock); hlist_nulls_del_rcu(&worker->nulls_node); + list_del_rcu(&worker->all_list); if (__io_worker_unuse(wqe, worker)) { __release(&wqe->lock); spin_lock_irq(&wqe->lock); @@ -590,6 +593,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list.head); + list_add_tail_rcu(&worker->all_list, &wqe->all_list); worker->flags |= IO_WORKER_F_FREE; if (index == IO_WQ_ACCT_BOUND) worker->flags |= IO_WORKER_F_BOUND; @@ -733,16 +737,13 @@ static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) * worker that isn't exiting */ static bool io_wq_for_each_worker(struct io_wqe *wqe, - struct io_wq_nulls_list *list, bool (*func)(struct io_worker *, void *), void *data) { - struct hlist_nulls_node *n; struct io_worker *worker; bool ret = false; -restart: - hlist_nulls_for_each_entry_rcu(worker, n, &list->head, nulls_node) { + list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { if (io_worker_get(worker)) { ret = func(worker, data); io_worker_release(worker); @@ -750,8 +751,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe, break; } } - if (!ret && get_nulls_value(n) != list->nulls) - goto restart; + return ret; } @@ -769,10 +769,7 @@ void io_wq_cancel_all(struct io_wq *wq) for (i = 0; i < wq->nr_wqes; i++) { struct io_wqe *wqe = wq->wqes[i]; - io_wq_for_each_worker(wqe, &wqe->busy_list, - io_wqe_worker_send_sig, NULL); - io_wq_for_each_worker(wqe, &wqe->free_list, - io_wqe_worker_send_sig, NULL); + io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL); } rcu_read_unlock(); } @@ -834,14 +831,7 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe, } rcu_read_lock(); - found = io_wq_for_each_worker(wqe, &wqe->free_list, io_work_cancel, - &data); - if (found) - goto done; - - found = io_wq_for_each_worker(wqe, &wqe->busy_list, io_work_cancel, - &data); -done: + found = io_wq_for_each_worker(wqe, io_work_cancel, &data); rcu_read_unlock(); return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; } @@ -919,14 +909,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, * completion will run normally in this case. */ rcu_read_lock(); - found = io_wq_for_each_worker(wqe, &wqe->free_list, io_wq_worker_cancel, - cwork); - if (found) - goto done; - - found = io_wq_for_each_worker(wqe, &wqe->busy_list, io_wq_worker_cancel, - cwork); -done: + found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, cwork); rcu_read_unlock(); return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; } @@ -1030,6 +1013,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, wqe->free_list.nulls = 0; INIT_HLIST_NULLS_HEAD(&wqe->busy_list.head, 1); wqe->busy_list.nulls = 1; + INIT_LIST_HEAD(&wqe->all_list); i++; } @@ -1077,10 +1061,7 @@ void io_wq_destroy(struct io_wq *wq) if (!wqe) continue; - io_wq_for_each_worker(wqe, &wqe->free_list, io_wq_worker_wake, - NULL); - io_wq_for_each_worker(wqe, &wqe->busy_list, io_wq_worker_wake, - NULL); + io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL); } rcu_read_unlock(); From 2f6d9b9d6357ede64a29437676884ee263039910 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Wed, 13 Nov 2019 18:06:24 +0800 Subject: [PATCH 60/64] io_uring: clean up io_uring_cancel_files() We don't use the return value anymore, drop it. Also drop the unecessary double cancel_req value check. Signed-off-by: Bob Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e1a3b8b667e0..297b9e80dc5c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4303,7 +4303,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, DEFINE_WAIT(wait); while (!list_empty_careful(&ctx->inflight_list)) { - enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; struct io_kiocb *cancel_req = NULL; spin_lock_irq(&ctx->inflight_lock); @@ -4321,14 +4320,12 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&ctx->inflight_lock); - if (cancel_req) { - ret = io_wq_cancel_work(ctx->io_wq, &cancel_req->work); - io_put_req(cancel_req); - } - /* We need to keep going until we don't find a matching req */ if (!cancel_req) break; + + io_wq_cancel_work(ctx->io_wq, &cancel_req->work); + io_put_req(cancel_req); schedule(); } finish_wait(&ctx->inflight_wait, &wait); From 9d858b21483981db9c0cb4b184d4cdeb4bc525c2 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Wed, 13 Nov 2019 18:06:25 +0800 Subject: [PATCH 61/64] io_uring: introduce req_need_defer() Makes the code easier to read. Signed-off-by: Bob Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 297b9e80dc5c..9500780bcaea 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -448,7 +448,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return NULL; } -static inline bool __io_sequence_defer(struct io_kiocb *req) +static inline bool __req_need_defer(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -456,12 +456,12 @@ static inline bool __io_sequence_defer(struct io_kiocb *req) + atomic_read(&ctx->cached_cq_overflow); } -static inline bool io_sequence_defer(struct io_kiocb *req) +static inline bool req_need_defer(struct io_kiocb *req) { - if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) - return false; + if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN) + return __req_need_defer(req); - return __io_sequence_defer(req); + return false; } static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) @@ -469,7 +469,7 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) struct io_kiocb *req; req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list); - if (req && !io_sequence_defer(req)) { + if (req && !req_need_defer(req)) { list_del_init(&req->list); return req; } @@ -482,7 +482,7 @@ static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) struct io_kiocb *req; req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list); - if (req && !__io_sequence_defer(req)) { + if (req && !__req_need_defer(req)) { list_del_init(&req->list); return req; } @@ -2436,7 +2436,8 @@ static int io_req_defer(struct io_kiocb *req) struct io_uring_sqe *sqe_copy; struct io_ring_ctx *ctx = req->ctx; - if (!io_sequence_defer(req) && list_empty(&ctx->defer_list)) + /* Still need defer if there is pending req in defer list. */ + if (!req_need_defer(req) && list_empty(&ctx->defer_list)) return 0; sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); @@ -2444,7 +2445,7 @@ static int io_req_defer(struct io_kiocb *req) return -EAGAIN; spin_lock_irq(&ctx->completion_lock); - if (!io_sequence_defer(req) && list_empty(&ctx->defer_list)) { + if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); kfree(sqe_copy); return 0; From a320e9fa1e2680116d165b9369dfa41d7cc1e1d1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 14 Nov 2019 00:11:01 +0300 Subject: [PATCH 62/64] io_uring: Fix getting file for non-fd opcodes For timeout requests and bunch of others io_uring tries to grab a file with specified fd, which is usually stdin/fd=0. Update io_op_needs_file() Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9500780bcaea..55f8b1d378df 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2599,6 +2599,10 @@ static bool io_op_needs_file(const struct io_uring_sqe *sqe) switch (op) { case IORING_OP_NOP: case IORING_OP_POLL_REMOVE: + case IORING_OP_TIMEOUT: + case IORING_OP_TIMEOUT_REMOVE: + case IORING_OP_ASYNC_CANCEL: + case IORING_OP_LINK_TIMEOUT: return false; default: return true; From 021d1cdda3875bf35edac9133335f622d7910abc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Nov 2019 08:00:41 -0700 Subject: [PATCH 63/64] io-wq: remove now redundant struct io_wq_nulls_list Since we don't iterate these lists anymore after commit: e61df66c69b1 ("io-wq: ensure free/busy list browsing see all items") we don't need to retain the nulls value we use for them. That means it's pretty pointless to wrap the hlist_nulls_head in a structure, so get rid of it. Signed-off-by: Jens Axboe --- fs/io-wq.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index fcb6c74209da..9174007ce107 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -59,11 +59,6 @@ struct io_worker { struct files_struct *restore_files; }; -struct io_wq_nulls_list { - struct hlist_nulls_head head; - unsigned long nulls; -}; - #if BITS_PER_LONG == 64 #define IO_WQ_HASH_ORDER 6 #else @@ -95,8 +90,8 @@ struct io_wqe { int node; struct io_wqe_acct acct[2]; - struct io_wq_nulls_list free_list; - struct io_wq_nulls_list busy_list; + struct hlist_nulls_head free_list; + struct hlist_nulls_head busy_list; struct list_head all_list; struct io_wq *wq; @@ -249,7 +244,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe) struct hlist_nulls_node *n; struct io_worker *worker; - n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list.head)); + n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list)); if (is_a_nulls(n)) return false; @@ -325,8 +320,7 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; hlist_nulls_del_init_rcu(&worker->nulls_node); - hlist_nulls_add_head_rcu(&worker->nulls_node, - &wqe->busy_list.head); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->busy_list); } /* @@ -365,8 +359,7 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) if (!(worker->flags & IO_WORKER_F_FREE)) { worker->flags |= IO_WORKER_F_FREE; hlist_nulls_del_init_rcu(&worker->nulls_node); - hlist_nulls_add_head_rcu(&worker->nulls_node, - &wqe->free_list.head); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); } return __io_worker_unuse(wqe, worker); @@ -592,7 +585,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) } spin_lock_irq(&wqe->lock); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list.head); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); list_add_tail_rcu(&worker->all_list, &wqe->all_list); worker->flags |= IO_WORKER_F_FREE; if (index == IO_WQ_ACCT_BOUND) @@ -617,7 +610,7 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) if (index == IO_WQ_ACCT_BOUND && !acct->nr_workers) return true; /* if we have available workers or no work, no need */ - if (!hlist_nulls_empty(&wqe->free_list.head) || !io_wqe_run_queue(wqe)) + if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe)) return false; return acct->nr_workers < acct->max_workers; } @@ -665,7 +658,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, return true; rcu_read_lock(); - free_worker = !hlist_nulls_empty(&wqe->free_list.head); + free_worker = !hlist_nulls_empty(&wqe->free_list); rcu_read_unlock(); if (free_worker) return true; @@ -1009,10 +1002,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, wqe->wq = wq; spin_lock_init(&wqe->lock); INIT_LIST_HEAD(&wqe->work_list); - INIT_HLIST_NULLS_HEAD(&wqe->free_list.head, 0); - wqe->free_list.nulls = 0; - INIT_HLIST_NULLS_HEAD(&wqe->busy_list.head, 1); - wqe->busy_list.nulls = 1; + INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); + INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1); INIT_LIST_HEAD(&wqe->all_list); i++; From eac406c61cd0ec8fe7970ca46ddf23e40a86b579 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Nov 2019 12:09:58 -0700 Subject: [PATCH 64/64] io_uring: make POLL_ADD/POLL_REMOVE scale better One of the obvious use cases for these commands is networking, where it's not uncommon to have tons of sockets open and polled for. The current implementation uses a list for insertion and lookup, which works fine for file based use cases where the count is usually low, it breaks down somewhat for higher number of files / sockets. A test case with 30k sockets being polled for and cancelled takes: real 0m6.968s user 0m0.002s sys 0m6.936s with the patch it takes: real 0m0.233s user 0m0.010s sys 0m0.176s If you go to 50k sockets, it gets even more abysmal with the current code: real 0m40.602s user 0m0.010s sys 0m40.555s with the patch it takes: real 0m0.398s user 0m0.000s sys 0m0.341s Change is pretty straight forward, just replace the cancel_list with a red/black tree instead. Signed-off-by: Jens Axboe --- fs/io_uring.c | 69 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 55f8b1d378df..5ad652fa24b8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -271,7 +271,7 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct list_head poll_list; - struct list_head cancel_list; + struct rb_root cancel_tree; spinlock_t inflight_lock; struct list_head inflight_list; @@ -323,7 +323,10 @@ struct io_kiocb { struct sqe_submit submit; struct io_ring_ctx *ctx; - struct list_head list; + union { + struct list_head list; + struct rb_node rb_node; + }; struct list_head link_list; unsigned int flags; refcount_t refs; @@ -433,7 +436,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); - INIT_LIST_HEAD(&ctx->cancel_list); + ctx->cancel_tree = RB_ROOT; INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); init_waitqueue_head(&ctx->inflight_wait); @@ -1934,6 +1937,14 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, #endif } +static inline void io_poll_remove_req(struct io_kiocb *req) +{ + if (!RB_EMPTY_NODE(&req->rb_node)) { + rb_erase(&req->rb_node, &req->ctx->cancel_tree); + RB_CLEAR_NODE(&req->rb_node); + } +} + static void io_poll_remove_one(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; @@ -1945,17 +1956,17 @@ static void io_poll_remove_one(struct io_kiocb *req) io_queue_async_work(req); } spin_unlock(&poll->head->lock); - - list_del_init(&req->list); + io_poll_remove_req(req); } static void io_poll_remove_all(struct io_ring_ctx *ctx) { + struct rb_node *node; struct io_kiocb *req; spin_lock_irq(&ctx->completion_lock); - while (!list_empty(&ctx->cancel_list)) { - req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list); + while ((node = rb_first(&ctx->cancel_tree)) != NULL) { + req = rb_entry(node, struct io_kiocb, rb_node); io_poll_remove_one(req); } spin_unlock_irq(&ctx->completion_lock); @@ -1963,13 +1974,21 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx) static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) { + struct rb_node *p, *parent = NULL; struct io_kiocb *req; - list_for_each_entry(req, &ctx->cancel_list, list) { - if (req->user_data != sqe_addr) - continue; - io_poll_remove_one(req); - return 0; + p = ctx->cancel_tree.rb_node; + while (p) { + parent = p; + req = rb_entry(parent, struct io_kiocb, rb_node); + if (sqe_addr < req->user_data) { + p = p->rb_left; + } else if (sqe_addr > req->user_data) { + p = p->rb_right; + } else { + io_poll_remove_one(req); + return 0; + } } return -ENOENT; @@ -2039,7 +2058,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) spin_unlock_irq(&ctx->completion_lock); return; } - list_del_init(&req->list); + io_poll_remove_req(req); io_poll_complete(req, mask); spin_unlock_irq(&ctx->completion_lock); @@ -2073,7 +2092,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * for finalizing the request, mark us as having grabbed that already. */ if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - list_del(&req->list); + io_poll_remove_req(req); io_poll_complete(req, mask); req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); @@ -2108,6 +2127,25 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, add_wait_queue(head, &pt->req->poll.wait); } +static void io_poll_req_insert(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct rb_node **p = &ctx->cancel_tree.rb_node; + struct rb_node *parent = NULL; + struct io_kiocb *tmp; + + while (*p) { + parent = *p; + tmp = rb_entry(parent, struct io_kiocb, rb_node); + if (req->user_data < tmp->user_data) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&req->rb_node, parent, p); + rb_insert_color(&req->rb_node, &ctx->cancel_tree); +} + static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_kiocb **nxt) { @@ -2129,6 +2167,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, INIT_IO_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + RB_CLEAR_NODE(&req->rb_node); poll->head = NULL; poll->done = false; @@ -2161,7 +2200,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, else if (cancel) WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */ - list_add_tail(&req->list, &ctx->cancel_list); + io_poll_req_insert(req); spin_unlock(&poll->head->lock); } if (mask) { /* no async, we'd stolen it */