From ba816ad61fdf31f59f423a773b00bfa2ed38243a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 28 Sep 2019 11:36:45 -0600
Subject: [PATCH 01/64] io_uring: run dependent links inline if possible

Currently any dependent link is executed from a new workqueue context,
which means that we'll be doing a context switch per link in the chain.
If we are running the completion of the current request from our async
workqueue and find that the next request is a link, then run it directly
from the workqueue context instead of forcing another switch.

This improves the performance of linked SQEs, and reduces the CPU
overhead.

Reviewed-by: Jackie Liu <liuyun01@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 160 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 113 insertions(+), 47 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c11c4157a4c2..5db0854fec74 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -666,7 +666,7 @@ static void __io_free_req(struct io_kiocb *req)
 	kmem_cache_free(req_cachep, req);
 }
 
-static void io_req_link_next(struct io_kiocb *req)
+static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 {
 	struct io_kiocb *nxt;
 
@@ -685,8 +685,16 @@ static void io_req_link_next(struct io_kiocb *req)
 		}
 
 		nxt->flags |= REQ_F_LINK_DONE;
-		INIT_WORK(&nxt->work, io_sq_wq_submit_work);
-		io_queue_async_work(req->ctx, nxt);
+		/*
+		 * If we're in async work, we can continue processing the chain
+		 * in this context instead of having to queue up new async work.
+		 */
+		if (nxtptr && current_work()) {
+			*nxtptr = nxt;
+		} else {
+			INIT_WORK(&nxt->work, io_sq_wq_submit_work);
+			io_queue_async_work(req->ctx, nxt);
+		}
 	}
 }
 
@@ -706,7 +714,7 @@ static void io_fail_links(struct io_kiocb *req)
 	}
 }
 
-static void io_free_req(struct io_kiocb *req)
+static void io_free_req(struct io_kiocb *req, struct io_kiocb **nxt)
 {
 	/*
 	 * If LINK is set, we have dependent requests in this chain. If we
@@ -718,16 +726,39 @@ static void io_free_req(struct io_kiocb *req)
 		if (req->flags & REQ_F_FAIL_LINK)
 			io_fail_links(req);
 		else
-			io_req_link_next(req);
+			io_req_link_next(req, nxt);
 	}
 
 	__io_free_req(req);
 }
 
-static void io_put_req(struct io_kiocb *req)
+/*
+ * Drop reference to request, return next in chain (if there is one) if this
+ * was the last reference to this request.
+ */
+static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
 {
+	struct io_kiocb *nxt = NULL;
+
 	if (refcount_dec_and_test(&req->refs))
-		io_free_req(req);
+		io_free_req(req, &nxt);
+
+	return nxt;
+}
+
+static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr)
+{
+	struct io_kiocb *nxt;
+
+	nxt = io_put_req_find_next(req);
+	if (nxt) {
+		if (nxtptr) {
+			*nxtptr = nxt;
+		} else {
+			INIT_WORK(&nxt->work, io_sq_wq_submit_work);
+			io_queue_async_work(nxt->ctx, nxt);
+		}
+	}
 }
 
 static unsigned io_cqring_events(struct io_rings *rings)
@@ -775,7 +806,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 				if (to_free == ARRAY_SIZE(reqs))
 					io_free_req_many(ctx, reqs, &to_free);
 			} else {
-				io_free_req(req);
+				io_free_req(req, NULL);
 			}
 		}
 	}
@@ -947,7 +978,7 @@ static void kiocb_end_write(struct io_kiocb *req)
 	file_end_write(req->file);
 }
 
-static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw_common(struct kiocb *kiocb, long res)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 
@@ -957,7 +988,22 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 	if ((req->flags & REQ_F_LINK) && res != req->result)
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req->ctx, req->user_data, res);
-	io_put_req(req);
+}
+
+static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+	io_complete_rw_common(kiocb, res);
+	io_put_req(req, NULL);
+}
+
+static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
+{
+	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+	io_complete_rw_common(kiocb, res);
+	return io_put_req_find_next(req);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -1153,6 +1199,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 	}
 }
 
+static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
+		       bool in_async)
+{
+	if (in_async && ret >= 0 && nxt && kiocb->ki_complete == io_complete_rw)
+		*nxt = __io_complete_rw(kiocb, ret);
+	else
+		io_rw_done(kiocb, ret);
+}
+
 static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 			   const struct io_uring_sqe *sqe,
 			   struct iov_iter *iter)
@@ -1369,7 +1424,7 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
 }
 
 static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
-		   bool force_nonblock)
+		   struct io_kiocb **nxt, bool force_nonblock)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
@@ -1418,7 +1473,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 			ret2 = -EAGAIN;
 		/* Catch -EAGAIN return for forced non-blocking submission */
 		if (!force_nonblock || ret2 != -EAGAIN) {
-			io_rw_done(kiocb, ret2);
+			kiocb_done(kiocb, ret2, nxt, s->needs_lock);
 		} else {
 			/*
 			 * If ->needs_lock is true, we're already in async
@@ -1434,7 +1489,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 }
 
 static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
-		    bool force_nonblock)
+		    struct io_kiocb **nxt, bool force_nonblock)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
@@ -1492,7 +1547,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 		else
 			ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
 		if (!force_nonblock || ret2 != -EAGAIN) {
-			io_rw_done(kiocb, ret2);
+			kiocb_done(kiocb, ret2, nxt, s->needs_lock);
 		} else {
 			/*
 			 * If ->needs_lock is true, we're already in async
@@ -1520,7 +1575,7 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 		return -EINVAL;
 
 	io_cqring_add_event(ctx, user_data, err);
-	io_put_req(req);
+	io_put_req(req, NULL);
 	return 0;
 }
 
@@ -1540,7 +1595,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 }
 
 static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-		    bool force_nonblock)
+		    struct io_kiocb **nxt, bool force_nonblock)
 {
 	loff_t sqe_off = READ_ONCE(sqe->off);
 	loff_t sqe_len = READ_ONCE(sqe->len);
@@ -1567,7 +1622,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
-	io_put_req(req);
+	io_put_req(req, nxt);
 	return 0;
 }
 
@@ -1589,6 +1644,7 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 static int io_sync_file_range(struct io_kiocb *req,
 			      const struct io_uring_sqe *sqe,
+			      struct io_kiocb **nxt,
 			      bool force_nonblock)
 {
 	loff_t sqe_off;
@@ -1613,13 +1669,13 @@ static int io_sync_file_range(struct io_kiocb *req,
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
-	io_put_req(req);
+	io_put_req(req, nxt);
 	return 0;
 }
 
 #if defined(CONFIG_NET)
 static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-			   bool force_nonblock,
+			   struct io_kiocb **nxt, bool force_nonblock,
 		   long (*fn)(struct socket *, struct user_msghdr __user *,
 				unsigned int))
 {
@@ -1649,26 +1705,28 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	}
 
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
-	io_put_req(req);
+	io_put_req(req, nxt);
 	return 0;
 }
 #endif
 
 static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-		      bool force_nonblock)
+		      struct io_kiocb **nxt, bool force_nonblock)
 {
 #if defined(CONFIG_NET)
-	return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
+	return io_send_recvmsg(req, sqe, nxt, force_nonblock,
+				__sys_sendmsg_sock);
 #else
 	return -EOPNOTSUPP;
 #endif
 }
 
 static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-		      bool force_nonblock)
+		      struct io_kiocb **nxt, bool force_nonblock)
 {
 #if defined(CONFIG_NET)
-	return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
+	return io_send_recvmsg(req, sqe, nxt, force_nonblock,
+				__sys_recvmsg_sock);
 #else
 	return -EOPNOTSUPP;
 #endif
@@ -1728,7 +1786,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
-	io_put_req(req);
+	io_put_req(req, NULL);
 	return 0;
 }
 
@@ -1769,7 +1827,7 @@ static void io_poll_complete_work(struct work_struct *work)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	io_cqring_ev_posted(ctx);
-	io_put_req(req);
+	io_put_req(req, NULL);
 }
 
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -1794,7 +1852,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 		spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 		io_cqring_ev_posted(ctx);
-		io_put_req(req);
+		io_put_req(req, NULL);
 	} else {
 		io_queue_async_work(ctx, req);
 	}
@@ -1886,7 +1944,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	if (mask) {
 		io_cqring_ev_posted(ctx);
-		io_put_req(req);
+		io_put_req(req, NULL);
 	}
 	return ipt.error;
 }
@@ -1919,7 +1977,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 
 	io_cqring_ev_posted(ctx);
 
-	io_put_req(req);
+	io_put_req(req, NULL);
 	return HRTIMER_NORESTART;
 }
 
@@ -2028,7 +2086,8 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
 }
 
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			   const struct sqe_submit *s, bool force_nonblock)
+			   const struct sqe_submit *s, struct io_kiocb **nxt,
+			   bool force_nonblock)
 {
 	int ret, opcode;
 
@@ -2045,21 +2104,21 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_READV:
 		if (unlikely(s->sqe->buf_index))
 			return -EINVAL;
-		ret = io_read(req, s, force_nonblock);
+		ret = io_read(req, s, nxt, force_nonblock);
 		break;
 	case IORING_OP_WRITEV:
 		if (unlikely(s->sqe->buf_index))
 			return -EINVAL;
-		ret = io_write(req, s, force_nonblock);
+		ret = io_write(req, s, nxt, force_nonblock);
 		break;
 	case IORING_OP_READ_FIXED:
-		ret = io_read(req, s, force_nonblock);
+		ret = io_read(req, s, nxt, force_nonblock);
 		break;
 	case IORING_OP_WRITE_FIXED:
-		ret = io_write(req, s, force_nonblock);
+		ret = io_write(req, s, nxt, force_nonblock);
 		break;
 	case IORING_OP_FSYNC:
-		ret = io_fsync(req, s->sqe, force_nonblock);
+		ret = io_fsync(req, s->sqe, nxt, force_nonblock);
 		break;
 	case IORING_OP_POLL_ADD:
 		ret = io_poll_add(req, s->sqe);
@@ -2068,13 +2127,13 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		ret = io_poll_remove(req, s->sqe);
 		break;
 	case IORING_OP_SYNC_FILE_RANGE:
-		ret = io_sync_file_range(req, s->sqe, force_nonblock);
+		ret = io_sync_file_range(req, s->sqe, nxt, force_nonblock);
 		break;
 	case IORING_OP_SENDMSG:
-		ret = io_sendmsg(req, s->sqe, force_nonblock);
+		ret = io_sendmsg(req, s->sqe, nxt, force_nonblock);
 		break;
 	case IORING_OP_RECVMSG:
-		ret = io_recvmsg(req, s->sqe, force_nonblock);
+		ret = io_recvmsg(req, s->sqe, nxt, force_nonblock);
 		break;
 	case IORING_OP_TIMEOUT:
 		ret = io_timeout(req, s->sqe);
@@ -2141,6 +2200,7 @@ static void io_sq_wq_submit_work(struct work_struct *work)
 		struct sqe_submit *s = &req->submit;
 		const struct io_uring_sqe *sqe = s->sqe;
 		unsigned int flags = req->flags;
+		struct io_kiocb *nxt = NULL;
 
 		/* Ensure we clear previously set non-block flag */
 		req->rw.ki_flags &= ~IOCB_NOWAIT;
@@ -2161,7 +2221,7 @@ static void io_sq_wq_submit_work(struct work_struct *work)
 			s->has_user = cur_mm != NULL;
 			s->needs_lock = true;
 			do {
-				ret = __io_submit_sqe(ctx, req, s, false);
+				ret = __io_submit_sqe(ctx, req, s, &nxt, false);
 				/*
 				 * We can get EAGAIN for polled IO even though
 				 * we're forcing a sync submission from here,
@@ -2175,16 +2235,22 @@ static void io_sq_wq_submit_work(struct work_struct *work)
 		}
 
 		/* drop submission reference */
-		io_put_req(req);
+		io_put_req(req, NULL);
 
 		if (ret) {
 			io_cqring_add_event(ctx, sqe->user_data, ret);
-			io_put_req(req);
+			io_put_req(req, NULL);
 		}
 
 		/* async context always use a copy of the sqe */
 		kfree(sqe);
 
+		/* if a dependent link is ready, do that as the next one */
+		if (!ret && nxt) {
+			req = nxt;
+			continue;
+		}
+
 		/* req from defer and link list needn't decrease async cnt */
 		if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
 			goto out;
@@ -2331,7 +2397,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 {
 	int ret;
 
-	ret = __io_submit_sqe(ctx, req, s, true);
+	ret = __io_submit_sqe(ctx, req, s, NULL, true);
 
 	/*
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -2364,14 +2430,14 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	}
 
 	/* drop submission reference */
-	io_put_req(req);
+	io_put_req(req, NULL);
 
 	/* and drop final reference, if we failed */
 	if (ret) {
 		io_cqring_add_event(ctx, req->user_data, ret);
 		if (req->flags & REQ_F_LINK)
 			req->flags |= REQ_F_FAIL_LINK;
-		io_put_req(req);
+		io_put_req(req, NULL);
 	}
 
 	return ret;
@@ -2385,7 +2451,7 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	ret = io_req_defer(ctx, req, s->sqe);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
-			io_free_req(req);
+			io_free_req(req, NULL);
 			io_cqring_add_event(ctx, s->sqe->user_data, ret);
 		}
 		return 0;
@@ -2412,7 +2478,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	ret = io_req_defer(ctx, req, s->sqe);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
-			io_free_req(req);
+			io_free_req(req, NULL);
 			__io_free_req(shadow);
 			io_cqring_add_event(ctx, s->sqe->user_data, ret);
 			return 0;
@@ -2460,7 +2526,7 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	ret = io_req_set_file(ctx, s, state, req);
 	if (unlikely(ret)) {
 err_req:
-		io_free_req(req);
+		io_free_req(req, NULL);
 err:
 		io_cqring_add_event(ctx, s->sqe->user_data, ret);
 		return;

From 08a451739a9b5783f67de51e84cb6d9559bb9dc4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 3 Oct 2019 08:11:03 -0600
Subject: [PATCH 02/64] io_uring: allow sparse fixed file sets

This is in preparation for allowing updates to fixed file sets without
requiring a full unregister+register.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 49 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5db0854fec74..b85e5feb774a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2379,6 +2379,8 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 		if (unlikely(!ctx->user_files ||
 		    (unsigned) fd >= ctx->nr_user_files))
 			return -EBADF;
+		if (!ctx->user_files[fd])
+			return -EBADF;
 		req->file = ctx->user_files[fd];
 		req->flags |= REQ_F_FIXED_FILE;
 	} else {
@@ -2999,7 +3001,8 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	int i;
 
 	for (i = 0; i < ctx->nr_user_files; i++)
-		fput(ctx->user_files[i]);
+		if (ctx->user_files[i])
+			fput(ctx->user_files[i]);
 #endif
 }
 
@@ -3067,7 +3070,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
 	struct sock *sk = ctx->ring_sock->sk;
 	struct scm_fp_list *fpl;
 	struct sk_buff *skb;
-	int i;
+	int i, nr_files;
 
 	if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
 		unsigned long inflight = ctx->user->unix_inflight + nr;
@@ -3087,21 +3090,31 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
 	}
 
 	skb->sk = sk;
-	skb->destructor = io_destruct_skb;
 
+	nr_files = 0;
 	fpl->user = get_uid(ctx->user);
 	for (i = 0; i < nr; i++) {
-		fpl->fp[i] = get_file(ctx->user_files[i + offset]);
-		unix_inflight(fpl->user, fpl->fp[i]);
+		if (!ctx->user_files[i + offset])
+			continue;
+		fpl->fp[nr_files] = get_file(ctx->user_files[i + offset]);
+		unix_inflight(fpl->user, fpl->fp[nr_files]);
+		nr_files++;
 	}
 
-	fpl->max = fpl->count = nr;
-	UNIXCB(skb).fp = fpl;
-	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
-	skb_queue_head(&sk->sk_receive_queue, skb);
+	if (nr_files) {
+		fpl->max = SCM_MAX_FD;
+		fpl->count = nr_files;
+		UNIXCB(skb).fp = fpl;
+		skb->destructor = io_destruct_skb;
+		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+		skb_queue_head(&sk->sk_receive_queue, skb);
 
-	for (i = 0; i < nr; i++)
-		fput(fpl->fp[i]);
+		for (i = 0; i < nr_files; i++)
+			fput(fpl->fp[i]);
+	} else {
+		kfree_skb(skb);
+		kfree(fpl);
+	}
 
 	return 0;
 }
@@ -3132,7 +3145,8 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
 		return 0;
 
 	while (total < ctx->nr_user_files) {
-		fput(ctx->user_files[total]);
+		if (ctx->user_files[total])
+			fput(ctx->user_files[total]);
 		total++;
 	}
 
@@ -3163,10 +3177,15 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	if (!ctx->user_files)
 		return -ENOMEM;
 
-	for (i = 0; i < nr_args; i++) {
+	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
 		ret = -EFAULT;
 		if (copy_from_user(&fd, &fds[i], sizeof(fd)))
 			break;
+		/* allow sparse sets */
+		if (fd == -1) {
+			ret = 0;
+			continue;
+		}
 
 		ctx->user_files[i] = fget(fd);
 
@@ -3184,13 +3203,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			fput(ctx->user_files[i]);
 			break;
 		}
-		ctx->nr_user_files++;
 		ret = 0;
 	}
 
 	if (ret) {
 		for (i = 0; i < ctx->nr_user_files; i++)
-			fput(ctx->user_files[i]);
+			if (ctx->user_files[i])
+				fput(ctx->user_files[i]);
 
 		kfree(ctx->user_files);
 		ctx->user_files = NULL;

From c3a31e605620c279163c14068a60869ea3fda203 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 3 Oct 2019 13:59:56 -0600
Subject: [PATCH 03/64] io_uring: add support for IORING_REGISTER_FILES_UPDATE

Allows the application to remove/replace/add files to/from a file set.
Passes in a struct:

struct io_uring_files_update {
	__u32 offset;
	__s32 *fds;
};

that holds an array of fds, size of array passed in through the usual
nr_args part of the io_uring_register() system call. The logic is as
follows:

1) If ->fds[i] is -1, the existing file at i + ->offset is removed from
   the set.
2) If ->fds[i] is a valid fd, the existing file at i + ->offset is
   replaced with ->fds[i].

For case #2, is the existing file is currently empty (fd == -1), the
new fd is simply added to the array.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 175 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |   6 ++
 2 files changed, 181 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b85e5feb774a..77774abb1074 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3224,6 +3224,178 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
+static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
+{
+#if defined(CONFIG_UNIX)
+	struct file *file = ctx->user_files[index];
+	struct sock *sock = ctx->ring_sock->sk;
+	struct sk_buff_head list, *head = &sock->sk_receive_queue;
+	struct sk_buff *skb;
+	int i;
+
+	__skb_queue_head_init(&list);
+
+	/*
+	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
+	 * remove this entry and rearrange the file array.
+	 */
+	skb = skb_dequeue(head);
+	while (skb) {
+		struct scm_fp_list *fp;
+
+		fp = UNIXCB(skb).fp;
+		for (i = 0; i < fp->count; i++) {
+			int left;
+
+			if (fp->fp[i] != file)
+				continue;
+
+			unix_notinflight(fp->user, fp->fp[i]);
+			left = fp->count - 1 - i;
+			if (left) {
+				memmove(&fp->fp[i], &fp->fp[i + 1],
+						left * sizeof(struct file *));
+			}
+			fp->count--;
+			if (!fp->count) {
+				kfree_skb(skb);
+				skb = NULL;
+			} else {
+				__skb_queue_tail(&list, skb);
+			}
+			fput(file);
+			file = NULL;
+			break;
+		}
+
+		if (!file)
+			break;
+
+		__skb_queue_tail(&list, skb);
+
+		skb = skb_dequeue(head);
+	}
+
+	if (skb_peek(&list)) {
+		spin_lock_irq(&head->lock);
+		while ((skb = __skb_dequeue(&list)) != NULL)
+			__skb_queue_tail(head, skb);
+		spin_unlock_irq(&head->lock);
+	}
+#else
+	fput(ctx->user_files[index]);
+#endif
+}
+
+static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
+				int index)
+{
+#if defined(CONFIG_UNIX)
+	struct sock *sock = ctx->ring_sock->sk;
+	struct sk_buff_head *head = &sock->sk_receive_queue;
+	struct sk_buff *skb;
+
+	/*
+	 * See if we can merge this file into an existing skb SCM_RIGHTS
+	 * file set. If there's no room, fall back to allocating a new skb
+	 * and filling it in.
+	 */
+	spin_lock_irq(&head->lock);
+	skb = skb_peek(head);
+	if (skb) {
+		struct scm_fp_list *fpl = UNIXCB(skb).fp;
+
+		if (fpl->count < SCM_MAX_FD) {
+			__skb_unlink(skb, head);
+			spin_unlock_irq(&head->lock);
+			fpl->fp[fpl->count] = get_file(file);
+			unix_inflight(fpl->user, fpl->fp[fpl->count]);
+			fpl->count++;
+			spin_lock_irq(&head->lock);
+			__skb_queue_head(head, skb);
+		} else {
+			skb = NULL;
+		}
+	}
+	spin_unlock_irq(&head->lock);
+
+	if (skb) {
+		fput(file);
+		return 0;
+	}
+
+	return __io_sqe_files_scm(ctx, 1, index);
+#else
+	return 0;
+#endif
+}
+
+static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
+			       unsigned nr_args)
+{
+	struct io_uring_files_update up;
+	__s32 __user *fds;
+	int fd, i, err;
+	__u32 done;
+
+	if (!ctx->user_files)
+		return -ENXIO;
+	if (!nr_args)
+		return -EINVAL;
+	if (copy_from_user(&up, arg, sizeof(up)))
+		return -EFAULT;
+	if (check_add_overflow(up.offset, nr_args, &done))
+		return -EOVERFLOW;
+	if (done > ctx->nr_user_files)
+		return -EINVAL;
+
+	done = 0;
+	fds = (__s32 __user *) up.fds;
+	while (nr_args) {
+		err = 0;
+		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
+			err = -EFAULT;
+			break;
+		}
+		i = array_index_nospec(up.offset, ctx->nr_user_files);
+		if (ctx->user_files[i]) {
+			io_sqe_file_unregister(ctx, i);
+			ctx->user_files[i] = NULL;
+		}
+		if (fd != -1) {
+			struct file *file;
+
+			file = fget(fd);
+			if (!file) {
+				err = -EBADF;
+				break;
+			}
+			/*
+			 * Don't allow io_uring instances to be registered. If
+			 * UNIX isn't enabled, then this causes a reference
+			 * cycle and this instance can never get freed. If UNIX
+			 * is enabled we'll handle it just fine, but there's
+			 * still no point in allowing a ring fd as it doesn't
+			 * support regular read/write anyway.
+			 */
+			if (file->f_op == &io_uring_fops) {
+				fput(file);
+				err = -EBADF;
+				break;
+			}
+			ctx->user_files[i] = file;
+			err = io_sqe_file_register(ctx, file, i);
+			if (err)
+				break;
+		}
+		nr_args--;
+		done++;
+		up.offset++;
+	}
+
+	return done ? done : err;
+}
+
 static int io_sq_offload_start(struct io_ring_ctx *ctx,
 			       struct io_uring_params *p)
 {
@@ -4031,6 +4203,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_sqe_files_unregister(ctx);
 		break;
+	case IORING_REGISTER_FILES_UPDATE:
+		ret = io_sqe_files_update(ctx, arg, nr_args);
+		break;
 	case IORING_REGISTER_EVENTFD:
 		ret = -EINVAL;
 		if (nr_args != 1)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ea57526a5b89..4f532d9c0554 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -150,5 +150,11 @@ struct io_uring_params {
 #define IORING_UNREGISTER_FILES		3
 #define IORING_REGISTER_EVENTFD		4
 #define IORING_UNREGISTER_EVENTFD	5
+#define IORING_REGISTER_FILES_UPDATE	6
+
+struct io_uring_files_update {
+	__u32 offset;
+	__s32 *fds;
+};
 
 #endif

From 33a107f0a1b8df0ad925e39d8afc97bb78e0cec1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 4 Oct 2019 12:10:03 -0600
Subject: [PATCH 04/64] io_uring: allow application controlled CQ ring size

We currently size the CQ ring as twice the SQ ring, to allow some
flexibility in not overflowing the CQ ring. This is done because the
SQE life time is different than that of the IO request itself, the SQE
is consumed as soon as the kernel has seen the entry.

Certain application don't need a huge SQ ring size, since they just
submit IO in batches. But they may have a lot of requests pending, and
hence need a big CQ ring to hold them all. By allowing the application
to control the CQ ring size multiplier, we can cater to those
applications more efficiently.

If an application wants to define its own CQ ring size, it must set
IORING_SETUP_CQSIZE in the setup flags, and fill out
io_uring_params->cq_entries. The value must be a power of two.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 20 +++++++++++++++++---
 include/uapi/linux/io_uring.h |  1 +
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 77774abb1074..bc93bdfe40e2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -76,6 +76,7 @@
 #include "internal.h"
 
 #define IORING_MAX_ENTRIES	32768
+#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
 #define IORING_MAX_FIXED_FILES	1024
 
 struct io_uring {
@@ -4049,10 +4050,23 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 	 * Use twice as many entries for the CQ ring. It's possible for the
 	 * application to drive a higher depth than the size of the SQ ring,
 	 * since the sqes are only used at submission time. This allows for
-	 * some flexibility in overcommitting a bit.
+	 * some flexibility in overcommitting a bit. If the application has
+	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
+	 * of CQ ring entries manually.
 	 */
 	p->sq_entries = roundup_pow_of_two(entries);
-	p->cq_entries = 2 * p->sq_entries;
+	if (p->flags & IORING_SETUP_CQSIZE) {
+		/*
+		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
+		 * to a power-of-two, if it isn't already. We do NOT impose
+		 * any cq vs sq ring sizing.
+		 */
+		if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES)
+			return -EINVAL;
+		p->cq_entries = roundup_pow_of_two(p->cq_entries);
+	} else {
+		p->cq_entries = 2 * p->sq_entries;
+	}
 
 	user = get_uid(current_user());
 	account_mem = !capable(CAP_IPC_LOCK);
@@ -4137,7 +4151,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 	}
 
 	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
-			IORING_SETUP_SQ_AFF))
+			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
 		return -EINVAL;
 
 	ret = io_uring_create(entries, &p);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4f532d9c0554..e0137ea6ad79 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -50,6 +50,7 @@ struct io_uring_sqe {
 #define IORING_SETUP_IOPOLL	(1U << 0)	/* io_context is polled */
 #define IORING_SETUP_SQPOLL	(1U << 1)	/* SQ poll thread */
 #define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
+#define IORING_SETUP_CQSIZE	(1U << 3)	/* app defines CQ size */
 
 #define IORING_OP_NOP		0
 #define IORING_OP_READV		1

From ba5290ccb6b57fc5e274ae46d051fba1f0ece262 Mon Sep 17 00:00:00 2001
From: Jackie Liu <liuyun01@kylinos.cn>
Date: Wed, 9 Oct 2019 09:19:59 +0800
Subject: [PATCH 05/64] io_uring: replace s->needs_lock with s->in_async

There is no function change, just to clean up the code, use s->in_async
to make the code know where it is.

Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bc93bdfe40e2..6bbca3d58941 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -271,7 +271,7 @@ struct sqe_submit {
 	unsigned short			index;
 	u32				sequence;
 	bool				has_user;
-	bool				needs_lock;
+	bool				in_async;
 	bool				needs_fixed_file;
 };
 
@@ -1474,13 +1474,9 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 			ret2 = -EAGAIN;
 		/* Catch -EAGAIN return for forced non-blocking submission */
 		if (!force_nonblock || ret2 != -EAGAIN) {
-			kiocb_done(kiocb, ret2, nxt, s->needs_lock);
+			kiocb_done(kiocb, ret2, nxt, s->in_async);
 		} else {
-			/*
-			 * If ->needs_lock is true, we're already in async
-			 * context.
-			 */
-			if (!s->needs_lock)
+			if (!s->in_async)
 				io_async_list_note(READ, req, iov_count);
 			ret = -EAGAIN;
 		}
@@ -1518,8 +1514,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 
 	ret = -EAGAIN;
 	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
-		/* If ->needs_lock is true, we're already in async context. */
-		if (!s->needs_lock)
+		if (!s->in_async)
 			io_async_list_note(WRITE, req, iov_count);
 		goto out_free;
 	}
@@ -1548,13 +1543,9 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 		else
 			ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
 		if (!force_nonblock || ret2 != -EAGAIN) {
-			kiocb_done(kiocb, ret2, nxt, s->needs_lock);
+			kiocb_done(kiocb, ret2, nxt, s->in_async);
 		} else {
-			/*
-			 * If ->needs_lock is true, we're already in async
-			 * context.
-			 */
-			if (!s->needs_lock)
+			if (!s->in_async)
 				io_async_list_note(WRITE, req, iov_count);
 			ret = -EAGAIN;
 		}
@@ -2152,10 +2143,10 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			return -EAGAIN;
 
 		/* workqueue context doesn't hold uring_lock, grab it now */
-		if (s->needs_lock)
+		if (s->in_async)
 			mutex_lock(&ctx->uring_lock);
 		io_iopoll_req_issued(req);
-		if (s->needs_lock)
+		if (s->in_async)
 			mutex_unlock(&ctx->uring_lock);
 	}
 
@@ -2220,7 +2211,7 @@ static void io_sq_wq_submit_work(struct work_struct *work)
 
 		if (!ret) {
 			s->has_user = cur_mm != NULL;
-			s->needs_lock = true;
+			s->in_async = true;
 			do {
 				ret = __io_submit_sqe(ctx, req, s, &nxt, false);
 				/*
@@ -2696,7 +2687,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 						-EFAULT);
 		} else {
 			s.has_user = has_user;
-			s.needs_lock = true;
+			s.in_async = true;
 			s.needs_fixed_file = true;
 			io_submit_sqe(ctx, &s, statep, &link);
 			submitted++;
@@ -2883,7 +2874,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 
 out:
 		s.has_user = true;
-		s.needs_lock = false;
+		s.in_async = false;
 		s.needs_fixed_file = false;
 		submit++;
 		io_submit_sqe(ctx, &s, statep, &link);

From a41525ab2e75987e809926352ebc6f1397da900e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 15 Oct 2019 16:48:15 -0600
Subject: [PATCH 06/64] io_uring: add support for absolute timeouts

This is a pretty trivial addition on top of the relative timeouts
we have now, but it's handy for ensuring tighter timing for those
that are building scheduling primitives on top of io_uring.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 17 ++++++++++++-----
 include/uapi/linux/io_uring.h |  5 +++++
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6bbca3d58941..2fc6809bc3a9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1978,13 +1978,17 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	unsigned count;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct list_head *entry;
+	enum hrtimer_mode mode;
 	struct timespec64 ts;
 	unsigned span = 0;
+	unsigned flags;
 
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
-	if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
-	    sqe->len != 1)
+	if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len != 1)
+		return -EINVAL;
+	flags = READ_ONCE(sqe->timeout_flags);
+	if (flags & ~IORING_TIMEOUT_ABS)
 		return -EINVAL;
 
 	if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
@@ -2042,10 +2046,13 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	list_add(&req->list, entry);
 	spin_unlock_irq(&ctx->completion_lock);
 
-	hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	if (flags & IORING_TIMEOUT_ABS)
+		mode = HRTIMER_MODE_ABS;
+	else
+		mode = HRTIMER_MODE_REL;
+	hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode);
 	req->timeout.timer.function = io_timeout_fn;
-	hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts),
-			HRTIMER_MODE_REL);
+	hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode);
 	return 0;
 }
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e0137ea6ad79..b402dfee5e15 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -70,6 +70,11 @@ struct io_uring_sqe {
  */
 #define IORING_FSYNC_DATASYNC	(1U << 0)
 
+/*
+ * sqe->timeout_flags
+ */
+#define IORING_TIMEOUT_ABS	(1U << 0)
+
 /*
  * IO completion data structure (Completion Queue Entry)
  */

From 11365043e5271fea4c92189a976833da477a3a44 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 16 Oct 2019 09:08:32 -0600
Subject: [PATCH 07/64] io_uring: add support for canceling timeout requests

We might have cases where the need for a specific timeout is gone, add
support for canceling an existing timeout operation. This works like the
POLL_REMOVE command, where the application passes in the user_data of
the timeout it wishes to cancel in the sqe->addr field.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 109 ++++++++++++++++++++++++++++------
 include/uapi/linux/io_uring.h |   1 +
 2 files changed, 92 insertions(+), 18 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2fc6809bc3a9..e5564cd91e9c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1944,8 +1944,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 {
 	struct io_ring_ctx *ctx;
-	struct io_kiocb *req, *prev;
+	struct io_kiocb *req;
 	unsigned long flags;
+	bool comp;
 
 	req = container_of(timer, struct io_kiocb, timeout.timer);
 	ctx = req->ctx;
@@ -1953,24 +1954,92 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
 	/*
-	 * Adjust the reqs sequence before the current one because it
-	 * will consume a slot in the cq_ring and the the cq_tail pointer
-	 * will be increased, otherwise other timeout reqs may return in
-	 * advance without waiting for enough wait_nr.
+	 * We could be racing with timeout deletion. If the list is empty,
+	 * then timeout lookup already found it and will be handling it.
 	 */
-	prev = req;
-	list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
-		prev->sequence++;
-	list_del(&req->list);
+	comp = !list_empty(&req->list);
+	if (comp) {
+		struct io_kiocb *prev;
 
-	io_cqring_fill_event(ctx, req->user_data, -ETIME);
-	io_commit_cqring(ctx);
+		/*
+		 * Adjust the reqs sequence before the current one because it
+		 * will consume a slot in the cq_ring and the the cq_tail
+		 * pointer will be increased, otherwise other timeout reqs may
+		 * return in advance without waiting for enough wait_nr.
+		 */
+		prev = req;
+		list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
+			prev->sequence++;
+
+		list_del_init(&req->list);
+		io_cqring_fill_event(ctx, req->user_data, -ETIME);
+		io_commit_cqring(ctx);
+	}
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
+	if (comp) {
+		io_cqring_ev_posted(ctx);
+		io_put_req(req, NULL);
+	}
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Remove or update an existing timeout command
+ */
+static int io_timeout_remove(struct io_kiocb *req,
+			     const struct io_uring_sqe *sqe)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *treq;
+	int ret = -ENOENT;
+	__u64 user_data;
+	unsigned flags;
+
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
+		return -EINVAL;
+	flags = READ_ONCE(sqe->timeout_flags);
+	if (flags)
+		return -EINVAL;
+
+	user_data = READ_ONCE(sqe->addr);
+	spin_lock_irq(&ctx->completion_lock);
+	list_for_each_entry(treq, &ctx->timeout_list, list) {
+		if (user_data == treq->user_data) {
+			list_del_init(&treq->list);
+			ret = 0;
+			break;
+		}
+	}
+
+	/* didn't find timeout */
+	if (ret) {
+fill_ev:
+		io_cqring_fill_event(ctx, req->user_data, ret);
+		io_commit_cqring(ctx);
+		spin_unlock_irq(&ctx->completion_lock);
+		io_cqring_ev_posted(ctx);
+		io_put_req(req, NULL);
+		return 0;
+	}
+
+	ret = hrtimer_try_to_cancel(&treq->timeout.timer);
+	if (ret == -1) {
+		ret = -EBUSY;
+		goto fill_ev;
+	}
+
+	io_cqring_fill_event(ctx, req->user_data, 0);
+	io_cqring_fill_event(ctx, treq->user_data, -ECANCELED);
+	io_commit_cqring(ctx);
+	spin_unlock_irq(&ctx->completion_lock);
 	io_cqring_ev_posted(ctx);
 
+	io_put_req(treq, NULL);
 	io_put_req(req, NULL);
-	return HRTIMER_NORESTART;
+	return 0;
 }
 
 static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -1994,6 +2063,13 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
 		return -EFAULT;
 
+	if (flags & IORING_TIMEOUT_ABS)
+		mode = HRTIMER_MODE_ABS;
+	else
+		mode = HRTIMER_MODE_REL;
+
+	hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode);
+
 	/*
 	 * sqe->off holds how many events that need to occur for this
 	 * timeout event to be satisfied.
@@ -2045,12 +2121,6 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	req->sequence -= span;
 	list_add(&req->list, entry);
 	spin_unlock_irq(&ctx->completion_lock);
-
-	if (flags & IORING_TIMEOUT_ABS)
-		mode = HRTIMER_MODE_ABS;
-	else
-		mode = HRTIMER_MODE_REL;
-	hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode);
 	req->timeout.timer.function = io_timeout_fn;
 	hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode);
 	return 0;
@@ -2137,6 +2207,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_TIMEOUT:
 		ret = io_timeout(req, s->sqe);
 		break;
+	case IORING_OP_TIMEOUT_REMOVE:
+		ret = io_timeout_remove(req, s->sqe);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index b402dfee5e15..6dc5ced1c37a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -64,6 +64,7 @@ struct io_uring_sqe {
 #define IORING_OP_SENDMSG	9
 #define IORING_OP_RECVMSG	10
 #define IORING_OP_TIMEOUT	11
+#define IORING_OP_TIMEOUT_REMOVE	12
 
 /*
  * sqe->fsync_flags

From c826bd7a743f275e2b68c16d595534063b400deb Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Tue, 15 Oct 2019 19:02:01 +0200
Subject: [PATCH 08/64] io_uring: add set of tracing events

To trace io_uring activity one can get an information from workqueue and
io trace events, but looks like some parts could be hard to identify via
this approach. Making what happens inside io_uring more transparent is
important to be able to reason about many aspects of it, hence introduce
the set of tracing events.

All such events could be roughly divided into two categories:

* those, that are helping to understand correctness (from both kernel
  and an application point of view). E.g. a ring creation, file
  registration, or waiting for available CQE. Proposed approach is to
  get a pointer to an original structure of interest (ring context, or
  request), and then find relevant events. io_uring_queue_async_work
  also exposes a pointer to work_struct, to be able to track down
  corresponding workqueue events.

* those, that provide performance related information. Mostly it's about
  events that change the flow of requests, e.g. whether an async work
  was queued, or delayed due to some dependencies. Another important
  case is how io_uring optimizations (e.g. registered files) are
  utilized.

Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                   |  17 ++
 include/Kbuild                  |   1 +
 include/trace/events/io_uring.h | 349 ++++++++++++++++++++++++++++++++
 3 files changed, 367 insertions(+)
 create mode 100644 include/trace/events/io_uring.h

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e5564cd91e9c..f83465fc1ed1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -71,6 +71,9 @@
 #include <linux/sizes.h>
 #include <linux/hugetlb.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/io_uring.h>
+
 #include <uapi/linux/io_uring.h>
 
 #include "internal.h"
@@ -491,6 +494,7 @@ static inline void io_queue_async_work(struct io_ring_ctx *ctx,
 		}
 	}
 
+	trace_io_uring_queue_async_work(ctx, rw, req, &req->work, req->flags);
 	queue_work(ctx->sqo_wq[rw], &req->work);
 }
 
@@ -710,6 +714,7 @@ static void io_fail_links(struct io_kiocb *req)
 		link = list_first_entry(&req->link_list, struct io_kiocb, list);
 		list_del(&link->list);
 
+		trace_io_uring_fail_link(req, link);
 		io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
 		__io_free_req(link);
 	}
@@ -2149,6 +2154,7 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->submit.sqe = sqe_copy;
 
 	INIT_WORK(&req->work, io_sq_wq_submit_work);
+	trace_io_uring_defer(ctx, req, false);
 	list_add_tail(&req->list, &ctx->defer_list);
 	spin_unlock_irq(&ctx->completion_lock);
 	return -EIOCBQUEUED;
@@ -2410,6 +2416,8 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
 		ret = false;
 	}
 	spin_unlock(&list->lock);
+
+	trace_io_uring_add_to_prev(req, ret);
 	return ret;
 }
 
@@ -2458,6 +2466,7 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 	} else {
 		if (s->needs_fixed_file)
 			return -EBADF;
+		trace_io_uring_file_get(ctx, fd);
 		req->file = io_file_get(state, fd);
 		if (unlikely(!req->file))
 			return -EBADF;
@@ -2567,6 +2576,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 	/* Insert shadow req to defer_list, blocking next IOs */
 	spin_lock_irq(&ctx->completion_lock);
+	trace_io_uring_defer(ctx, shadow, true);
 	list_add_tail(&shadow->list, &ctx->defer_list);
 	spin_unlock_irq(&ctx->completion_lock);
 
@@ -2626,6 +2636,7 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 
 		s->sqe = sqe_copy;
 		memcpy(&req->submit, s, sizeof(*s));
+		trace_io_uring_link(ctx, req, prev);
 		list_add_tail(&req->list, &prev->link_list);
 	} else if (s->sqe->flags & IOSQE_IO_LINK) {
 		req->flags |= REQ_F_LINK;
@@ -2769,6 +2780,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			s.has_user = has_user;
 			s.in_async = true;
 			s.needs_fixed_file = true;
+			trace_io_uring_submit_sqe(ctx, true, true);
 			io_submit_sqe(ctx, &s, statep, &link);
 			submitted++;
 		}
@@ -2957,6 +2969,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 		s.in_async = false;
 		s.needs_fixed_file = false;
 		submit++;
+		trace_io_uring_submit_sqe(ctx, true, false);
 		io_submit_sqe(ctx, &s, statep, &link);
 	}
 
@@ -3039,6 +3052,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 
 	ret = 0;
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
 		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
 						TASK_INTERRUPTIBLE);
@@ -4197,6 +4211,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 		goto err;
 
 	p->features = IORING_FEAT_SINGLE_MMAP;
+	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
 	return ret;
 err:
 	io_ring_ctx_wait_and_kill(ctx);
@@ -4334,6 +4349,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	mutex_lock(&ctx->uring_lock);
 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
 	mutex_unlock(&ctx->uring_lock);
+	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
+							ctx->cq_ev_fd != NULL, ret);
 out_fput:
 	fdput(f);
 	return ret;
diff --git a/include/Kbuild b/include/Kbuild
index ffba79483cc5..61b66725d259 100644
--- a/include/Kbuild
+++ b/include/Kbuild
@@ -1028,6 +1028,7 @@ header-test-			+= trace/events/fsi_master_gpio.h
 header-test-			+= trace/events/huge_memory.h
 header-test-			+= trace/events/ib_mad.h
 header-test-			+= trace/events/ib_umad.h
+header-test-			+= trace/events/io_uring.h
 header-test-			+= trace/events/iscsi.h
 header-test-			+= trace/events/jbd2.h
 header-test-			+= trace/events/kvm.h
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
new file mode 100644
index 000000000000..c5a905fbf1da
--- /dev/null
+++ b/include/trace/events/io_uring.h
@@ -0,0 +1,349 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM io_uring
+
+#if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IO_URING_H
+
+#include <linux/tracepoint.h>
+
+/**
+ * io_uring_create - called after a new io_uring context was prepared
+ *
+ * @fd:			corresponding file descriptor
+ * @ctx:		pointer to a ring context structure
+ * @sq_entries:	actual SQ size
+ * @cq_entries:	actual CQ size
+ * @flags:		SQ ring flags, provided to io_uring_setup(2)
+ *
+ * Allows to trace io_uring creation and provide pointer to a context, that can
+ * be used later to find correlated events.
+ */
+TRACE_EVENT(io_uring_create,
+
+	TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags),
+
+	TP_ARGS(fd, ctx, sq_entries, cq_entries, flags),
+
+	TP_STRUCT__entry (
+		__field(  int,		fd			)
+		__field(  void *,	ctx			)
+		__field(  u32,		sq_entries	)
+		__field(  u32,		cq_entries	)
+		__field(  u32,		flags		)
+	),
+
+	TP_fast_assign(
+		__entry->fd			= fd;
+		__entry->ctx		= ctx;
+		__entry->sq_entries	= sq_entries;
+		__entry->cq_entries	= cq_entries;
+		__entry->flags		= flags;
+	),
+
+	TP_printk("ring %p, fd %d sq size %d, cq size %d, flags %d",
+			  __entry->ctx, __entry->fd, __entry->sq_entries,
+			  __entry->cq_entries, __entry->flags)
+);
+
+/**
+ * io_uring_register - called after a buffer/file/eventfd was succesfully
+ * 					   registered for a ring
+ *
+ * @ctx:			pointer to a ring context structure
+ * @opcode:			describes which operation to perform
+ * @nr_user_files:	number of registered files
+ * @nr_user_bufs:	number of registered buffers
+ * @cq_ev_fd:		whether eventfs registered or not
+ * @ret:			return code
+ *
+ * Allows to trace fixed files/buffers/eventfds, that could be registered to
+ * avoid an overhead of getting references to them for every operation. This
+ * event, together with io_uring_file_get, can provide a full picture of how
+ * much overhead one can reduce via fixing.
+ */
+TRACE_EVENT(io_uring_register,
+
+	TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files,
+			 unsigned nr_bufs, bool eventfd, long ret),
+
+	TP_ARGS(ctx, opcode, nr_files, nr_bufs, eventfd, ret),
+
+	TP_STRUCT__entry (
+		__field(  void *,	ctx			)
+		__field(  unsigned,	opcode		)
+		__field(  unsigned,	nr_files	)
+		__field(  unsigned,	nr_bufs		)
+		__field(  bool,		eventfd		)
+		__field(  long,		ret			)
+	),
+
+	TP_fast_assign(
+		__entry->ctx		= ctx;
+		__entry->opcode		= opcode;
+		__entry->nr_files	= nr_files;
+		__entry->nr_bufs	= nr_bufs;
+		__entry->eventfd	= eventfd;
+		__entry->ret		= ret;
+	),
+
+	TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, "
+			  "eventfd %d, ret %ld",
+			  __entry->ctx, __entry->opcode, __entry->nr_files,
+			  __entry->nr_bufs, __entry->eventfd, __entry->ret)
+);
+
+/**
+ * io_uring_file_get - called before getting references to an SQE file
+ *
+ * @ctx:	pointer to a ring context structure
+ * @fd:		SQE file descriptor
+ *
+ * Allows to trace out how often an SQE file reference is obtained, which can
+ * help figuring out if it makes sense to use fixed files, or check that fixed
+ * files are used correctly.
+ */
+TRACE_EVENT(io_uring_file_get,
+
+	TP_PROTO(void *ctx, int fd),
+
+	TP_ARGS(ctx, fd),
+
+	TP_STRUCT__entry (
+		__field(  void *,	ctx	)
+		__field(  int,		fd	)
+	),
+
+	TP_fast_assign(
+		__entry->ctx	= ctx;
+		__entry->fd		= fd;
+	),
+
+	TP_printk("ring %p, fd %d", __entry->ctx, __entry->fd)
+);
+
+/**
+ * io_uring_queue_async_work - called before submitting a new async work
+ *
+ * @ctx:	pointer to a ring context structure
+ * @rw:		type of workqueue, normal or buffered writes
+ * @req:	pointer to a submitted request
+ * @work:	pointer to a submitted work_struct
+ *
+ * Allows to trace asynchronous work submission.
+ */
+TRACE_EVENT(io_uring_queue_async_work,
+
+	TP_PROTO(void *ctx, int rw, void * req, struct work_struct *work,
+			 unsigned int flags),
+
+	TP_ARGS(ctx, rw, req, work, flags),
+
+	TP_STRUCT__entry (
+		__field(  void *,				ctx		)
+		__field(  int,					rw		)
+		__field(  void *,				req		)
+		__field(  struct work_struct *,	work	)
+		__field(  unsigned int,			flags	)
+	),
+
+	TP_fast_assign(
+		__entry->ctx	= ctx;
+		__entry->rw		= rw;
+		__entry->req	= req;
+		__entry->work	= work;
+		__entry->flags	= flags;
+	),
+
+	TP_printk("ring %p, request %p, flags %d, %s queue, work %p",
+			  __entry->ctx, __entry->req, __entry->flags,
+			  __entry->rw ? "buffered" : "normal", __entry->work)
+);
+
+/**
+ * io_uring_defer_list - called before the io_uring work added into defer_list
+ *
+ * @ctx:	pointer to a ring context structure
+ * @req:	pointer to a deferred request
+ * @shadow: whether request is shadow or not
+ *
+ * Allows to track deferred requests, to get an insight about what requests are
+ * not started immediately.
+ */
+TRACE_EVENT(io_uring_defer,
+
+	TP_PROTO(void *ctx, void *req, bool shadow),
+
+	TP_ARGS(ctx, req, shadow),
+
+	TP_STRUCT__entry (
+		__field(  void *,	ctx		)
+		__field(  void *,	req		)
+		__field(  bool,		shadow	)
+	),
+
+	TP_fast_assign(
+		__entry->ctx	= ctx;
+		__entry->req	= req;
+		__entry->shadow	= shadow;
+	),
+
+	TP_printk("ring %p, request %p%s", __entry->ctx, __entry->req,
+			  __entry->shadow ? ", shadow": "")
+);
+
+/**
+ * io_uring_link - called before the io_uring request added into link_list of
+ * 				   another request
+ *
+ * @ctx:			pointer to a ring context structure
+ * @req:			pointer to a linked request
+ * @target_req:		pointer to a previous request, that would contain @req
+ *
+ * Allows to track linked requests, to understand dependencies between requests
+ * and how does it influence their execution flow.
+ */
+TRACE_EVENT(io_uring_link,
+
+	TP_PROTO(void *ctx, void *req, void *target_req),
+
+	TP_ARGS(ctx, req, target_req),
+
+	TP_STRUCT__entry (
+		__field(  void *,	ctx			)
+		__field(  void *,	req			)
+		__field(  void *,	target_req	)
+	),
+
+	TP_fast_assign(
+		__entry->ctx		= ctx;
+		__entry->req		= req;
+		__entry->target_req	= target_req;
+	),
+
+	TP_printk("ring %p, request %p linked after %p",
+			  __entry->ctx, __entry->req, __entry->target_req)
+);
+
+/**
+ * io_uring_add_to_prev - called after a request was added into a previously
+ * 						  submitted work
+ *
+ * @req:	pointer to a request, added to a previous
+ * @ret:	whether or not it was completed successfully
+ *
+ * Allows to track merged work, to figure out how often requests are piggy
+ * backed into other ones, changing the execution flow.
+ */
+TRACE_EVENT(io_uring_add_to_prev,
+
+	TP_PROTO(void *req, bool ret),
+
+	TP_ARGS(req, ret),
+
+	TP_STRUCT__entry (
+		__field(  void *,	req	)
+		__field(  bool,		ret	)
+	),
+
+	TP_fast_assign(
+		__entry->req	= req;
+		__entry->ret	= ret;
+	),
+
+	TP_printk("request %p, ret %d", __entry->req, __entry->ret)
+);
+
+/**
+ * io_uring_cqring_wait - called before start waiting for an available CQE
+ *
+ * @ctx:		pointer to a ring context structure
+ * @min_events:	minimal number of events to wait for
+ *
+ * Allows to track waiting for CQE, so that we can e.g. troubleshoot
+ * situations, when an application wants to wait for an event, that never
+ * comes.
+ */
+TRACE_EVENT(io_uring_cqring_wait,
+
+	TP_PROTO(void *ctx, int min_events),
+
+	TP_ARGS(ctx, min_events),
+
+	TP_STRUCT__entry (
+		__field(  void *,	ctx			)
+		__field(  int,		min_events	)
+	),
+
+	TP_fast_assign(
+		__entry->ctx	= ctx;
+		__entry->min_events	= min_events;
+	),
+
+	TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events)
+);
+
+/**
+ * io_uring_fail_link - called before failing a linked request
+ *
+ * @req:	request, which links were cancelled
+ * @link:	cancelled link
+ *
+ * Allows to track linked requests cancellation, to see not only that some work
+ * was cancelled, but also which request was the reason.
+ */
+TRACE_EVENT(io_uring_fail_link,
+
+	TP_PROTO(void *req, void *link),
+
+	TP_ARGS(req, link),
+
+	TP_STRUCT__entry (
+		__field(  void *,	req		)
+		__field(  void *,	link	)
+	),
+
+	TP_fast_assign(
+		__entry->req	= req;
+		__entry->link	= link;
+	),
+
+	TP_printk("request %p, link %p", __entry->req, __entry->link)
+);
+
+/**
+ * io_uring_submit_sqe - called before submitting one SQE
+ *
+ * @ctx:			pointer to a ring context structure
+ * @force_nonblock:	whether a context blocking or not
+ * @sq_thread:		true if sq_thread has submitted this SQE
+ *
+ * Allows to track SQE submitting, to understand what was the source of it, SQ
+ * thread or io_uring_enter call.
+ */
+TRACE_EVENT(io_uring_submit_sqe,
+
+	TP_PROTO(void *ctx, bool force_nonblock, bool sq_thread),
+
+	TP_ARGS(ctx, force_nonblock, sq_thread),
+
+	TP_STRUCT__entry (
+		__field(  void *,	ctx				)
+		__field(  bool,		force_nonblock	)
+		__field(  bool,		sq_thread		)
+	),
+
+	TP_fast_assign(
+		__entry->ctx			= ctx;
+		__entry->force_nonblock	= force_nonblock;
+		__entry->sq_thread		= sq_thread;
+	),
+
+	TP_printk("ring %p, non block %d, sq_thread %d",
+			  __entry->ctx, __entry->force_nonblock, __entry->sq_thread)
+);
+
+#endif /* _TRACE_IO_URING_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

From fa4562280889ad372dfb1413833a8b8675721b17 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 27 Oct 2019 18:52:20 +0300
Subject: [PATCH 09/64] io_uring: remove index from sqe_submit

submit->index is used only for inbound check in submission path (i.e.
head < ctx->sq_entries). However, it always will be true, as
1. it's already validated by io_get_sqring()
2. ctx->sq_entries can't be changedd in between, because of held
ctx->uring_lock and ctx->refs.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f83465fc1ed1..545efd89a1f9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -271,7 +271,6 @@ struct io_ring_ctx {
 
 struct sqe_submit {
 	const struct io_uring_sqe	*sqe;
-	unsigned short			index;
 	u32				sequence;
 	bool				has_user;
 	bool				in_async;
@@ -2168,9 +2167,6 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 	req->user_data = READ_ONCE(s->sqe->user_data);
 
-	if (unlikely(s->index >= ctx->sq_entries))
-		return -EINVAL;
-
 	opcode = READ_ONCE(s->sqe->opcode);
 	switch (opcode) {
 	case IORING_OP_NOP:
@@ -2716,7 +2712,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 
 	head = READ_ONCE(sq_array[head & ctx->sq_mask]);
 	if (head < ctx->sq_entries) {
-		s->index = head;
 		s->sqe = &ctx->sq_sqes[head];
 		s->sequence = ctx->cached_sq_head;
 		ctx->cached_sq_head++;

From 95a1b3ff9a3e4ea2f26c4e802067d58831f415db Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 27 Oct 2019 23:15:41 +0300
Subject: [PATCH 10/64] io_uring: Fix mm_fault with READ/WRITE_FIXED

Commit fb5ccc98782f ("io_uring: Fix broken links with offloading")
introduced a potential performance regression with unconditionally
taking mm even for READ/WRITE_FIXED operations.

Return the logic handling it back. mm-faulted requests will go through
the generic submission path, so honoring links and drains, but will
fail further on req->has_user check.

Fixes: fb5ccc98782f ("io_uring: Fix broken links with offloading")
Cc: stable@vger.kernel.org # v5.4
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 41 +++++++++++++++++------------------------
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 545efd89a1f9..f9eff8f62ddb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2726,13 +2726,14 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 }
 
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
-			  bool has_user, bool mm_fault)
+			  struct mm_struct **mm)
 {
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
 	struct io_kiocb *shadow_req = NULL;
 	bool prev_was_link = false;
 	int i, submitted = 0;
+	bool mm_fault = false;
 
 	if (nr > IO_PLUG_THRESHOLD) {
 		io_submit_state_start(&state, ctx, nr);
@@ -2745,6 +2746,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		if (!io_get_sqring(ctx, &s))
 			break;
 
+		if (io_sqe_needs_user(s.sqe) && !*mm) {
+			mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
+			if (!mm_fault) {
+				use_mm(ctx->sqo_mm);
+				*mm = ctx->sqo_mm;
+			}
+		}
+
 		/*
 		 * If previous wasn't linked and we have a linked command,
 		 * that's the end of the chain. Submit the previous link.
@@ -2768,17 +2777,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		}
 
 out:
-		if (unlikely(mm_fault)) {
-			io_cqring_add_event(ctx, s.sqe->user_data,
-						-EFAULT);
-		} else {
-			s.has_user = has_user;
-			s.in_async = true;
-			s.needs_fixed_file = true;
-			trace_io_uring_submit_sqe(ctx, true, true);
-			io_submit_sqe(ctx, &s, statep, &link);
-			submitted++;
-		}
+		s.has_user = *mm != NULL;
+		s.in_async = true;
+		s.needs_fixed_file = true;
+		trace_io_uring_submit_sqe(ctx, true, true);
+		io_submit_sqe(ctx, &s, statep, &link);
+		submitted++;
 	}
 
 	if (link)
@@ -2805,7 +2809,6 @@ static int io_sq_thread(void *data)
 
 	timeout = inflight = 0;
 	while (!kthread_should_park()) {
-		bool mm_fault = false;
 		unsigned int to_submit;
 
 		if (inflight) {
@@ -2890,18 +2893,8 @@ static int io_sq_thread(void *data)
 			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
 		}
 
-		/* Unless all new commands are FIXED regions, grab mm */
-		if (!cur_mm) {
-			mm_fault = !mmget_not_zero(ctx->sqo_mm);
-			if (!mm_fault) {
-				use_mm(ctx->sqo_mm);
-				cur_mm = ctx->sqo_mm;
-			}
-		}
-
 		to_submit = min(to_submit, ctx->sq_entries);
-		inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL,
-					   mm_fault);
+		inflight += io_submit_sqes(ctx, to_submit, &cur_mm);
 
 		/* Commit SQ ring head once we've consumed all SQEs */
 		io_commit_sqring(ctx);

From 771b53d033e8663abdf59704806aa856b236dcdb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 22 Oct 2019 10:25:58 -0600
Subject: [PATCH 11/64] io-wq: small threadpool implementation for io_uring

This adds support for io-wq, a smaller and specialized thread pool
implementation. This is meant to replace workqueues for io_uring. Among
the reasons for this addition are:

- We can assign memory context smarter and more persistently if we
  manage the life time of threads.

- We can drop various work-arounds we have in io_uring, like the
  async_list.

- We can implement hashed work insertion, to manage concurrency of
  buffered writes without needing a) an extra workqueue, or b)
  needlessly making the concurrency of said workqueue very low
  which hurts performance of multiple buffered file writers.

- We can implement cancel through signals, for cancelling
  interruptible work like read/write (or send/recv) to/from sockets.

- We need the above cancel for being able to assign and use file tables
  from a process.

- We can implement a more thorough cancel operation in general.

- We need it to move towards a syslet/threadlet model for even faster
  async execution. For that we need to take ownership of the used
  threads.

This list is just off the top of my head. Performance should be the
same, or better, at least that's what I've seen in my testing. io-wq
supports basic NUMA functionality, setting up a pool per node.

io-wq hooks up to the scheduler schedule in/out just like workqueue
and uses that to drive the need for more/less workers.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/Kconfig            |   3 +
 fs/Makefile           |   1 +
 fs/io-wq.c            | 825 ++++++++++++++++++++++++++++++++++++++++++
 fs/io-wq.h            |  55 +++
 include/linux/sched.h |   1 +
 kernel/sched/core.c   |  16 +-
 6 files changed, 897 insertions(+), 4 deletions(-)
 create mode 100644 fs/io-wq.c
 create mode 100644 fs/io-wq.h

diff --git a/fs/Kconfig b/fs/Kconfig
index 2501e6f1f965..7b623e9fc1b0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -322,4 +322,7 @@ source "fs/nls/Kconfig"
 source "fs/dlm/Kconfig"
 source "fs/unicode/Kconfig"
 
+config IO_WQ
+	bool
+
 endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 14231b4cf383..1148c555c4d3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_USERFAULTFD)	+= userfaultfd.o
 obj-$(CONFIG_AIO)               += aio.o
 obj-$(CONFIG_IO_URING)		+= io_uring.o
+obj-$(CONFIG_IO_WQ)		+= io-wq.o
 obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FS_ENCRYPTION)	+= crypto/
 obj-$(CONFIG_FS_VERITY)		+= verity/
diff --git a/fs/io-wq.c b/fs/io-wq.c
new file mode 100644
index 000000000000..37863879e987
--- /dev/null
+++ b/fs/io-wq.c
@@ -0,0 +1,825 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Basic worker thread pool for io_uring
+ *
+ * Copyright (C) 2019 Jens Axboe
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/sched/signal.h>
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
+#include <linux/sched/mm.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/rculist_nulls.h>
+
+#include "io-wq.h"
+
+#define WORKER_IDLE_TIMEOUT	(5 * HZ)
+
+enum {
+	IO_WORKER_F_UP		= 1,	/* up and active */
+	IO_WORKER_F_RUNNING	= 2,	/* account as running */
+	IO_WORKER_F_FREE	= 4,	/* worker on free list */
+	IO_WORKER_F_EXITING	= 8,	/* worker exiting */
+	IO_WORKER_F_FIXED	= 16,	/* static idle worker */
+};
+
+enum {
+	IO_WQ_BIT_EXIT		= 0,	/* wq exiting */
+	IO_WQ_BIT_CANCEL	= 1,	/* cancel work on list */
+};
+
+enum {
+	IO_WQE_FLAG_STALLED	= 1,	/* stalled on hash */
+};
+
+/*
+ * One for each thread in a wqe pool
+ */
+struct io_worker {
+	refcount_t ref;
+	unsigned flags;
+	struct hlist_nulls_node nulls_node;
+	struct task_struct *task;
+	wait_queue_head_t wait;
+	struct io_wqe *wqe;
+	struct io_wq_work *cur_work;
+
+	struct rcu_head rcu;
+	struct mm_struct *mm;
+};
+
+struct io_wq_nulls_list {
+	struct hlist_nulls_head head;
+	unsigned long nulls;
+};
+
+#if BITS_PER_LONG == 64
+#define IO_WQ_HASH_ORDER	6
+#else
+#define IO_WQ_HASH_ORDER	5
+#endif
+
+/*
+ * Per-node worker thread pool
+ */
+struct io_wqe {
+	struct {
+		spinlock_t lock;
+		struct list_head work_list;
+		unsigned long hash_map;
+		unsigned flags;
+	} ____cacheline_aligned_in_smp;
+
+	int node;
+	unsigned nr_workers;
+	unsigned max_workers;
+	atomic_t nr_running;
+
+	struct io_wq_nulls_list free_list;
+	struct io_wq_nulls_list busy_list;
+
+	struct io_wq *wq;
+};
+
+/*
+ * Per io_wq state
+  */
+struct io_wq {
+	struct io_wqe **wqes;
+	unsigned long state;
+	unsigned nr_wqes;
+
+	struct task_struct *manager;
+	struct mm_struct *mm;
+	refcount_t refs;
+	struct completion done;
+};
+
+static void io_wq_free_worker(struct rcu_head *head)
+{
+	struct io_worker *worker = container_of(head, struct io_worker, rcu);
+
+	kfree(worker);
+}
+
+static bool io_worker_get(struct io_worker *worker)
+{
+	return refcount_inc_not_zero(&worker->ref);
+}
+
+static void io_worker_release(struct io_worker *worker)
+{
+	if (refcount_dec_and_test(&worker->ref))
+		wake_up_process(worker->task);
+}
+
+/*
+ * Note: drops the wqe->lock if returning true! The caller must re-acquire
+ * the lock in that case. Some callers need to restart handling if this
+ * happens, so we can't just re-acquire the lock on behalf of the caller.
+ */
+static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
+{
+	/*
+	 * If we have an active mm, we need to drop the wq lock before unusing
+	 * it. If we do, return true and let the caller retry the idle loop.
+	 */
+	if (worker->mm) {
+		__acquire(&wqe->lock);
+		spin_unlock_irq(&wqe->lock);
+		__set_current_state(TASK_RUNNING);
+		set_fs(KERNEL_DS);
+		unuse_mm(worker->mm);
+		mmput(worker->mm);
+		worker->mm = NULL;
+		return true;
+	}
+
+	return false;
+}
+
+static void io_worker_exit(struct io_worker *worker)
+{
+	struct io_wqe *wqe = worker->wqe;
+	bool all_done = false;
+
+	/*
+	 * If we're not at zero, someone else is holding a brief reference
+	 * to the worker. Wait for that to go away.
+	 */
+	set_current_state(TASK_INTERRUPTIBLE);
+	if (!refcount_dec_and_test(&worker->ref))
+		schedule();
+	__set_current_state(TASK_RUNNING);
+
+	preempt_disable();
+	current->flags &= ~PF_IO_WORKER;
+	if (worker->flags & IO_WORKER_F_RUNNING)
+		atomic_dec(&wqe->nr_running);
+	worker->flags = 0;
+	preempt_enable();
+
+	spin_lock_irq(&wqe->lock);
+	hlist_nulls_del_rcu(&worker->nulls_node);
+	if (__io_worker_unuse(wqe, worker)) {
+		__release(&wqe->lock);
+		spin_lock_irq(&wqe->lock);
+	}
+	wqe->nr_workers--;
+	all_done = !wqe->nr_workers;
+	spin_unlock_irq(&wqe->lock);
+
+	/* all workers gone, wq exit can proceed */
+	if (all_done && refcount_dec_and_test(&wqe->wq->refs))
+		complete(&wqe->wq->done);
+
+	call_rcu(&worker->rcu, io_wq_free_worker);
+}
+
+static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
+{
+	allow_kernel_signal(SIGINT);
+
+	current->flags |= PF_IO_WORKER;
+
+	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
+	atomic_inc(&wqe->nr_running);
+}
+
+/*
+ * Worker will start processing some work. Move it to the busy list, if
+ * it's currently on the freelist
+ */
+static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
+			     struct io_wq_work *work)
+	__must_hold(wqe->lock)
+{
+	if (worker->flags & IO_WORKER_F_FREE) {
+		worker->flags &= ~IO_WORKER_F_FREE;
+		hlist_nulls_del_init_rcu(&worker->nulls_node);
+		hlist_nulls_add_head_rcu(&worker->nulls_node,
+						&wqe->busy_list.head);
+	}
+	worker->cur_work = work;
+}
+
+/*
+ * No work, worker going to sleep. Move to freelist, and unuse mm if we
+ * have one attached. Dropping the mm may potentially sleep, so we drop
+ * the lock in that case and return success. Since the caller has to
+ * retry the loop in that case (we changed task state), we don't regrab
+ * the lock if we return success.
+ */
+static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
+	__must_hold(wqe->lock)
+{
+	if (!(worker->flags & IO_WORKER_F_FREE)) {
+		worker->flags |= IO_WORKER_F_FREE;
+		hlist_nulls_del_init_rcu(&worker->nulls_node);
+		hlist_nulls_add_head_rcu(&worker->nulls_node,
+						&wqe->free_list.head);
+	}
+
+	return __io_worker_unuse(wqe, worker);
+}
+
+static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
+	__must_hold(wqe->lock)
+{
+	struct io_wq_work *work;
+
+	list_for_each_entry(work, &wqe->work_list, list) {
+		/* not hashed, can run anytime */
+		if (!(work->flags & IO_WQ_WORK_HASHED)) {
+			list_del(&work->list);
+			return work;
+		}
+
+		/* hashed, can run if not already running */
+		*hash = work->flags >> IO_WQ_HASH_SHIFT;
+		if (!(wqe->hash_map & BIT_ULL(*hash))) {
+			wqe->hash_map |= BIT_ULL(*hash);
+			list_del(&work->list);
+			return work;
+		}
+	}
+
+	return NULL;
+}
+
+static void io_worker_handle_work(struct io_worker *worker)
+	__releases(wqe->lock)
+{
+	struct io_wq_work *work, *old_work;
+	struct io_wqe *wqe = worker->wqe;
+	struct io_wq *wq = wqe->wq;
+
+	do {
+		unsigned hash = -1U;
+
+		/*
+		 * Signals are either sent to cancel specific work, or to just
+		 * cancel all work items. For the former, ->cur_work must
+		 * match. ->cur_work is NULL at this point, since we haven't
+		 * assigned any work, so it's safe to flush signals for that
+		 * case. For the latter case of cancelling all work, the caller
+		 * wil have set IO_WQ_BIT_CANCEL.
+		 */
+		if (signal_pending(current))
+			flush_signals(current);
+
+		/*
+		 * If we got some work, mark us as busy. If we didn't, but
+		 * the list isn't empty, it means we stalled on hashed work.
+		 * Mark us stalled so we don't keep looking for work when we
+		 * can't make progress, any work completion or insertion will
+		 * clear the stalled flag.
+		 */
+		work = io_get_next_work(wqe, &hash);
+		if (work)
+			__io_worker_busy(wqe, worker, work);
+		else if (!list_empty(&wqe->work_list))
+			wqe->flags |= IO_WQE_FLAG_STALLED;
+
+		spin_unlock_irq(&wqe->lock);
+		if (!work)
+			break;
+next:
+		if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
+		    wq->mm && mmget_not_zero(wq->mm)) {
+			use_mm(wq->mm);
+			set_fs(USER_DS);
+			worker->mm = wq->mm;
+		}
+		if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
+			work->flags |= IO_WQ_WORK_CANCEL;
+		if (worker->mm)
+			work->flags |= IO_WQ_WORK_HAS_MM;
+
+		old_work = work;
+		work->func(&work);
+
+		spin_lock_irq(&wqe->lock);
+		worker->cur_work = NULL;
+		if (hash != -1U) {
+			wqe->hash_map &= ~BIT_ULL(hash);
+			wqe->flags &= ~IO_WQE_FLAG_STALLED;
+		}
+		if (work && work != old_work) {
+			spin_unlock_irq(&wqe->lock);
+			/* dependent work not hashed */
+			hash = -1U;
+			goto next;
+		}
+	} while (1);
+}
+
+static inline bool io_wqe_run_queue(struct io_wqe *wqe)
+	__must_hold(wqe->lock)
+{
+	if (!list_empty_careful(&wqe->work_list) &&
+	    !(wqe->flags & IO_WQE_FLAG_STALLED))
+		return true;
+	return false;
+}
+
+static int io_wqe_worker(void *data)
+{
+	struct io_worker *worker = data;
+	struct io_wqe *wqe = worker->wqe;
+	struct io_wq *wq = wqe->wq;
+	DEFINE_WAIT(wait);
+
+	io_worker_start(wqe, worker);
+
+	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
+		prepare_to_wait(&worker->wait, &wait, TASK_INTERRUPTIBLE);
+
+		spin_lock_irq(&wqe->lock);
+		if (io_wqe_run_queue(wqe)) {
+			__set_current_state(TASK_RUNNING);
+			io_worker_handle_work(worker);
+			continue;
+		}
+		/* drops the lock on success, retry */
+		if (__io_worker_idle(wqe, worker)) {
+			__release(&wqe->lock);
+			continue;
+		}
+		spin_unlock_irq(&wqe->lock);
+		if (signal_pending(current))
+			flush_signals(current);
+		if (schedule_timeout(WORKER_IDLE_TIMEOUT))
+			continue;
+		/* timed out, exit unless we're the fixed worker */
+		if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
+		    !(worker->flags & IO_WORKER_F_FIXED))
+			break;
+	}
+
+	finish_wait(&worker->wait, &wait);
+
+	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
+		spin_lock_irq(&wqe->lock);
+		if (!list_empty(&wqe->work_list))
+			io_worker_handle_work(worker);
+		else
+			spin_unlock_irq(&wqe->lock);
+	}
+
+	io_worker_exit(worker);
+	return 0;
+}
+
+/*
+ * Check head of free list for an available worker. If one isn't available,
+ * caller must wake up the wq manager to create one.
+ */
+static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
+	__must_hold(RCU)
+{
+	struct hlist_nulls_node *n;
+	struct io_worker *worker;
+
+	n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list.head));
+	if (is_a_nulls(n))
+		return false;
+
+	worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
+	if (io_worker_get(worker)) {
+		wake_up(&worker->wait);
+		io_worker_release(worker);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * We need a worker. If we find a free one, we're good. If not, and we're
+ * below the max number of workers, wake up the manager to create one.
+ */
+static void io_wqe_wake_worker(struct io_wqe *wqe)
+{
+	bool ret;
+
+	rcu_read_lock();
+	ret = io_wqe_activate_free_worker(wqe);
+	rcu_read_unlock();
+
+	if (!ret && wqe->nr_workers < wqe->max_workers)
+		wake_up_process(wqe->wq->manager);
+}
+
+/*
+ * Called when a worker is scheduled in. Mark us as currently running.
+ */
+void io_wq_worker_running(struct task_struct *tsk)
+{
+	struct io_worker *worker = kthread_data(tsk);
+	struct io_wqe *wqe = worker->wqe;
+
+	if (!(worker->flags & IO_WORKER_F_UP))
+		return;
+	if (worker->flags & IO_WORKER_F_RUNNING)
+		return;
+	worker->flags |= IO_WORKER_F_RUNNING;
+	atomic_inc(&wqe->nr_running);
+}
+
+/*
+ * Called when worker is going to sleep. If there are no workers currently
+ * running and we have work pending, wake up a free one or have the manager
+ * set one up.
+ */
+void io_wq_worker_sleeping(struct task_struct *tsk)
+{
+	struct io_worker *worker = kthread_data(tsk);
+	struct io_wqe *wqe = worker->wqe;
+
+	if (!(worker->flags & IO_WORKER_F_UP))
+		return;
+	if (!(worker->flags & IO_WORKER_F_RUNNING))
+		return;
+
+	worker->flags &= ~IO_WORKER_F_RUNNING;
+
+	spin_lock_irq(&wqe->lock);
+	if (atomic_dec_and_test(&wqe->nr_running) && io_wqe_run_queue(wqe))
+		io_wqe_wake_worker(wqe);
+	spin_unlock_irq(&wqe->lock);
+}
+
+static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe)
+{
+	struct io_worker *worker;
+
+	worker = kcalloc_node(1, sizeof(*worker), GFP_KERNEL, wqe->node);
+	if (!worker)
+		return;
+
+	refcount_set(&worker->ref, 1);
+	worker->nulls_node.pprev = NULL;
+	init_waitqueue_head(&worker->wait);
+	worker->wqe = wqe;
+
+	worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
+						"io_wqe_worker-%d", wqe->node);
+	if (IS_ERR(worker->task)) {
+		kfree(worker);
+		return;
+	}
+
+	spin_lock_irq(&wqe->lock);
+	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list.head);
+	worker->flags |= IO_WORKER_F_FREE;
+	if (!wqe->nr_workers)
+		worker->flags |= IO_WORKER_F_FIXED;
+	wqe->nr_workers++;
+	spin_unlock_irq(&wqe->lock);
+
+	wake_up_process(worker->task);
+}
+
+static inline bool io_wqe_need_new_worker(struct io_wqe *wqe)
+	__must_hold(wqe->lock)
+{
+	if (!wqe->nr_workers)
+		return true;
+	if (hlist_nulls_empty(&wqe->free_list.head) &&
+	    wqe->nr_workers < wqe->max_workers && io_wqe_run_queue(wqe))
+		return true;
+
+	return false;
+}
+
+/*
+ * Manager thread. Tasked with creating new workers, if we need them.
+ */
+static int io_wq_manager(void *data)
+{
+	struct io_wq *wq = data;
+
+	while (!kthread_should_stop()) {
+		int i;
+
+		for (i = 0; i < wq->nr_wqes; i++) {
+			struct io_wqe *wqe = wq->wqes[i];
+			bool fork_worker = false;
+
+			spin_lock_irq(&wqe->lock);
+			fork_worker = io_wqe_need_new_worker(wqe);
+			spin_unlock_irq(&wqe->lock);
+			if (fork_worker)
+				create_io_worker(wq, wqe);
+		}
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ);
+	}
+
+	return 0;
+}
+
+static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&wqe->lock, flags);
+	list_add_tail(&work->list, &wqe->work_list);
+	wqe->flags &= ~IO_WQE_FLAG_STALLED;
+	spin_unlock_irqrestore(&wqe->lock, flags);
+
+	if (!atomic_read(&wqe->nr_running))
+		io_wqe_wake_worker(wqe);
+}
+
+void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
+{
+	struct io_wqe *wqe = wq->wqes[numa_node_id()];
+
+	io_wqe_enqueue(wqe, work);
+}
+
+/*
+ * Enqueue work, hashed by some key. Work items that hash to the same value
+ * will not be done in parallel. Used to limit concurrent writes, generally
+ * hashed by inode.
+ */
+void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val)
+{
+	struct io_wqe *wqe = wq->wqes[numa_node_id()];
+	unsigned bit;
+
+
+	bit = hash_ptr(val, IO_WQ_HASH_ORDER);
+	work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
+	io_wqe_enqueue(wqe, work);
+}
+
+static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
+{
+	send_sig(SIGINT, worker->task, 1);
+	return false;
+}
+
+/*
+ * Iterate the passed in list and call the specific function for each
+ * worker that isn't exiting
+ */
+static bool io_wq_for_each_worker(struct io_wqe *wqe,
+				  struct io_wq_nulls_list *list,
+				  bool (*func)(struct io_worker *, void *),
+				  void *data)
+{
+	struct hlist_nulls_node *n;
+	struct io_worker *worker;
+	bool ret = false;
+
+restart:
+	hlist_nulls_for_each_entry_rcu(worker, n, &list->head, nulls_node) {
+		if (io_worker_get(worker)) {
+			ret = func(worker, data);
+			io_worker_release(worker);
+			if (ret)
+				break;
+		}
+	}
+	if (!ret && get_nulls_value(n) != list->nulls)
+		goto restart;
+	return ret;
+}
+
+void io_wq_cancel_all(struct io_wq *wq)
+{
+	int i;
+
+	set_bit(IO_WQ_BIT_CANCEL, &wq->state);
+
+	/*
+	 * Browse both lists, as there's a gap between handing work off
+	 * to a worker and the worker putting itself on the busy_list
+	 */
+	rcu_read_lock();
+	for (i = 0; i < wq->nr_wqes; i++) {
+		struct io_wqe *wqe = wq->wqes[i];
+
+		io_wq_for_each_worker(wqe, &wqe->busy_list,
+					io_wqe_worker_send_sig, NULL);
+		io_wq_for_each_worker(wqe, &wqe->free_list,
+					io_wqe_worker_send_sig, NULL);
+	}
+	rcu_read_unlock();
+}
+
+static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
+{
+	struct io_wq_work *work = data;
+
+	if (worker->cur_work == work) {
+		send_sig(SIGINT, worker->task, 1);
+		return true;
+	}
+
+	return false;
+}
+
+static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
+					    struct io_wq_work *cwork)
+{
+	struct io_wq_work *work;
+	bool found = false;
+
+	cwork->flags |= IO_WQ_WORK_CANCEL;
+
+	/*
+	 * First check pending list, if we're lucky we can just remove it
+	 * from there. CANCEL_OK means that the work is returned as-new,
+	 * no completion will be posted for it.
+	 */
+	spin_lock_irq(&wqe->lock);
+	list_for_each_entry(work, &wqe->work_list, list) {
+		if (work == cwork) {
+			list_del(&work->list);
+			found = true;
+			break;
+		}
+	}
+	spin_unlock_irq(&wqe->lock);
+
+	if (found) {
+		work->flags |= IO_WQ_WORK_CANCEL;
+		work->func(&work);
+		return IO_WQ_CANCEL_OK;
+	}
+
+	/*
+	 * Now check if a free (going busy) or busy worker has the work
+	 * currently running. If we find it there, we'll return CANCEL_RUNNING
+	 * as an indication that we attempte to signal cancellation. The
+	 * completion will run normally in this case.
+	 */
+	rcu_read_lock();
+	found = io_wq_for_each_worker(wqe, &wqe->free_list, io_wq_worker_cancel,
+					cwork);
+	if (found)
+		goto done;
+
+	found = io_wq_for_each_worker(wqe, &wqe->busy_list, io_wq_worker_cancel,
+					cwork);
+done:
+	rcu_read_unlock();
+	return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
+}
+
+enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
+{
+	enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
+	int i;
+
+	for (i = 0; i < wq->nr_wqes; i++) {
+		struct io_wqe *wqe = wq->wqes[i];
+
+		ret = io_wqe_cancel_work(wqe, cwork);
+		if (ret != IO_WQ_CANCEL_NOTFOUND)
+			break;
+	}
+
+	return ret;
+}
+
+struct io_wq_flush_data {
+	struct io_wq_work work;
+	struct completion done;
+};
+
+static void io_wq_flush_func(struct io_wq_work **workptr)
+{
+	struct io_wq_work *work = *workptr;
+	struct io_wq_flush_data *data;
+
+	data = container_of(work, struct io_wq_flush_data, work);
+	complete(&data->done);
+}
+
+/*
+ * Doesn't wait for previously queued work to finish. When this completes,
+ * it just means that previously queued work was started.
+ */
+void io_wq_flush(struct io_wq *wq)
+{
+	struct io_wq_flush_data data;
+	int i;
+
+	for (i = 0; i < wq->nr_wqes; i++) {
+		struct io_wqe *wqe = wq->wqes[i];
+
+		init_completion(&data.done);
+		INIT_IO_WORK(&data.work, io_wq_flush_func);
+		io_wqe_enqueue(wqe, &data.work);
+		wait_for_completion(&data.done);
+	}
+}
+
+struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm)
+{
+	int ret = -ENOMEM, i, node;
+	struct io_wq *wq;
+
+	wq = kcalloc(1, sizeof(*wq), GFP_KERNEL);
+	if (!wq)
+		return ERR_PTR(-ENOMEM);
+
+	wq->nr_wqes = num_online_nodes();
+	wq->wqes = kcalloc(wq->nr_wqes, sizeof(struct io_wqe *), GFP_KERNEL);
+	if (!wq->wqes) {
+		kfree(wq);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	i = 0;
+	refcount_set(&wq->refs, wq->nr_wqes);
+	for_each_online_node(node) {
+		struct io_wqe *wqe;
+
+		wqe = kcalloc_node(1, sizeof(struct io_wqe), GFP_KERNEL, node);
+		if (!wqe)
+			break;
+		wq->wqes[i] = wqe;
+		wqe->node = node;
+		wqe->max_workers = concurrency;
+		wqe->node = node;
+		wqe->wq = wq;
+		spin_lock_init(&wqe->lock);
+		INIT_LIST_HEAD(&wqe->work_list);
+		INIT_HLIST_NULLS_HEAD(&wqe->free_list.head, 0);
+		wqe->free_list.nulls = 0;
+		INIT_HLIST_NULLS_HEAD(&wqe->busy_list.head, 1);
+		wqe->busy_list.nulls = 1;
+		atomic_set(&wqe->nr_running, 0);
+
+		i++;
+	}
+
+	init_completion(&wq->done);
+
+	if (i != wq->nr_wqes)
+		goto err;
+
+	/* caller must have already done mmgrab() on this mm */
+	wq->mm = mm;
+
+	wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
+	if (!IS_ERR(wq->manager)) {
+		wake_up_process(wq->manager);
+		return wq;
+	}
+
+	ret = PTR_ERR(wq->manager);
+	wq->manager = NULL;
+err:
+	complete(&wq->done);
+	io_wq_destroy(wq);
+	return ERR_PTR(ret);
+}
+
+static bool io_wq_worker_wake(struct io_worker *worker, void *data)
+{
+	wake_up_process(worker->task);
+	return false;
+}
+
+void io_wq_destroy(struct io_wq *wq)
+{
+	int i;
+
+	if (wq->manager) {
+		set_bit(IO_WQ_BIT_EXIT, &wq->state);
+		kthread_stop(wq->manager);
+	}
+
+	rcu_read_lock();
+	for (i = 0; i < wq->nr_wqes; i++) {
+		struct io_wqe *wqe = wq->wqes[i];
+
+		if (!wqe)
+			continue;
+		io_wq_for_each_worker(wqe, &wqe->free_list, io_wq_worker_wake,
+						NULL);
+		io_wq_for_each_worker(wqe, &wqe->busy_list, io_wq_worker_wake,
+						NULL);
+	}
+	rcu_read_unlock();
+
+	wait_for_completion(&wq->done);
+
+	for (i = 0; i < wq->nr_wqes; i++)
+		kfree(wq->wqes[i]);
+	kfree(wq->wqes);
+	kfree(wq);
+}
diff --git a/fs/io-wq.h b/fs/io-wq.h
new file mode 100644
index 000000000000..be8f22c8937b
--- /dev/null
+++ b/fs/io-wq.h
@@ -0,0 +1,55 @@
+#ifndef INTERNAL_IO_WQ_H
+#define INTERNAL_IO_WQ_H
+
+struct io_wq;
+
+enum {
+	IO_WQ_WORK_CANCEL	= 1,
+	IO_WQ_WORK_HAS_MM	= 2,
+	IO_WQ_WORK_HASHED	= 4,
+	IO_WQ_WORK_NEEDS_USER	= 8,
+
+	IO_WQ_HASH_SHIFT	= 24,	/* upper 8 bits are used for hash key */
+};
+
+enum io_wq_cancel {
+	IO_WQ_CANCEL_OK,	/* cancelled before started */
+	IO_WQ_CANCEL_RUNNING,	/* found, running, and attempted cancelled */
+	IO_WQ_CANCEL_NOTFOUND,	/* work not found */
+};
+
+struct io_wq_work {
+	struct list_head list;
+	void (*func)(struct io_wq_work **);
+	unsigned flags;
+};
+
+#define INIT_IO_WORK(work, _func)			\
+	do {						\
+		(work)->func = _func;			\
+		(work)->flags = 0;			\
+	} while (0)					\
+
+struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm);
+void io_wq_destroy(struct io_wq *wq);
+
+void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
+void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
+void io_wq_flush(struct io_wq *wq);
+
+void io_wq_cancel_all(struct io_wq *wq);
+enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
+
+#if defined(CONFIG_IO_WQ)
+extern void io_wq_worker_sleeping(struct task_struct *);
+extern void io_wq_worker_running(struct task_struct *);
+#else
+static inline void io_wq_worker_sleeping(struct task_struct *tsk)
+{
+}
+static inline void io_wq_worker_running(struct task_struct *tsk)
+{
+}
+#endif
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 67a1d86981a9..6666e25606b7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1468,6 +1468,7 @@ extern struct pid *cad_pid;
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
 #define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
+#define PF_IO_WORKER		0x20000000	/* Task is an IO worker */
 #define PF_FREEZER_SKIP		0x40000000	/* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK		0x80000000      /* This thread called freeze_processes() and should not be frozen */
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dd05a378631a..a95a2f05f3b5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -16,6 +16,7 @@
 #include <asm/tlb.h>
 
 #include "../workqueue_internal.h"
+#include "../../fs/io-wq.h"
 #include "../smpboot.h"
 
 #include "pelt.h"
@@ -4103,9 +4104,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
 	 * we disable preemption to avoid it calling schedule() again
 	 * in the possible wakeup of a kworker.
 	 */
-	if (tsk->flags & PF_WQ_WORKER) {
+	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
 		preempt_disable();
-		wq_worker_sleeping(tsk);
+		if (tsk->flags & PF_WQ_WORKER)
+			wq_worker_sleeping(tsk);
+		else
+			io_wq_worker_sleeping(tsk);
 		preempt_enable_no_resched();
 	}
 
@@ -4122,8 +4126,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
 
 static void sched_update_worker(struct task_struct *tsk)
 {
-	if (tsk->flags & PF_WQ_WORKER)
-		wq_worker_running(tsk);
+	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+		if (tsk->flags & PF_WQ_WORKER)
+			wq_worker_running(tsk);
+		else
+			io_wq_worker_running(tsk);
+	}
 }
 
 asmlinkage __visible void __sched schedule(void)

From 561fb04a6a2257716738dac2ed812f377c2634c2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 24 Oct 2019 07:25:42 -0600
Subject: [PATCH 12/64] io_uring: replace workqueue usage with io-wq

Drop various work-arounds we have for workqueues:

- We no longer need the async_list for tracking sequential IO.

- We don't have to maintain our own mm tracking/setting.

- We don't need a separate workqueue for buffered writes. This didn't
  even work that well to begin with, as it was suboptimal for multiple
  buffered writers on multiple files.

- We can properly cancel pending interruptible work. This fixes
  deadlocks with particularly socket IO, where we cannot cancel them
  when the io_uring is closed. Hence the ring will wait forever for
  these requests to complete, which may never happen. This is different
  from disk IO where we know requests will complete in a finite amount
  of time.

- Due to being able to cancel work interruptible work that is already
  running, we can implement file table support for work. We need that
  for supporting system calls that add to a process file table.

- It gets us one step closer to adding async support for any system
  call.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                   | 419 ++++++++------------------------
 include/trace/events/io_uring.h |  12 +-
 init/Kconfig                    |   1 +
 3 files changed, 108 insertions(+), 324 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f9eff8f62ddb..d94bd4e3a60e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -56,7 +56,6 @@
 #include <linux/mmu_context.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
-#include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/blkdev.h>
 #include <linux/bvec.h>
@@ -77,6 +76,7 @@
 #include <uapi/linux/io_uring.h>
 
 #include "internal.h"
+#include "io-wq.h"
 
 #define IORING_MAX_ENTRIES	32768
 #define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
@@ -165,16 +165,6 @@ struct io_mapped_ubuf {
 	unsigned int	nr_bvecs;
 };
 
-struct async_list {
-	spinlock_t		lock;
-	atomic_t		cnt;
-	struct list_head	list;
-
-	struct file		*file;
-	off_t			io_start;
-	size_t			io_len;
-};
-
 struct io_ring_ctx {
 	struct {
 		struct percpu_ref	refs;
@@ -209,7 +199,7 @@ struct io_ring_ctx {
 	} ____cacheline_aligned_in_smp;
 
 	/* IO offload */
-	struct workqueue_struct	*sqo_wq[2];
+	struct io_wq		*io_wq;
 	struct task_struct	*sqo_thread;	/* if using sq thread polling */
 	struct mm_struct	*sqo_mm;
 	wait_queue_head_t	sqo_wait;
@@ -262,8 +252,6 @@ struct io_ring_ctx {
 		struct list_head	cancel_list;
 	} ____cacheline_aligned_in_smp;
 
-	struct async_list	pending_async[2];
-
 #if defined(CONFIG_UNIX)
 	struct socket		*ring_sock;
 #endif
@@ -333,7 +321,7 @@ struct io_kiocb {
 	u32			result;
 	u32			sequence;
 
-	struct work_struct	work;
+	struct io_wq_work	work;
 };
 
 #define IO_PLUG_THRESHOLD		2
@@ -359,7 +347,7 @@ struct io_submit_state {
 	unsigned int		ios_left;
 };
 
-static void io_sq_wq_submit_work(struct work_struct *work);
+static void io_wq_submit_work(struct io_wq_work **workptr);
 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 				 long res);
 static void __io_free_req(struct io_kiocb *req);
@@ -391,7 +379,6 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 {
 	struct io_ring_ctx *ctx;
-	int i;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
@@ -409,11 +396,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_completion(&ctx->sqo_thread_started);
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->wait);
-	for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
-		spin_lock_init(&ctx->pending_async[i].lock);
-		INIT_LIST_HEAD(&ctx->pending_async[i].list);
-		atomic_set(&ctx->pending_async[i].cnt, 0);
-	}
 	spin_lock_init(&ctx->completion_lock);
 	INIT_LIST_HEAD(&ctx->poll_list);
 	INIT_LIST_HEAD(&ctx->cancel_list);
@@ -479,22 +461,45 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
 	}
 }
 
-static inline void io_queue_async_work(struct io_ring_ctx *ctx,
-				       struct io_kiocb *req)
+static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
 {
-	int rw = 0;
+	u8 opcode = READ_ONCE(sqe->opcode);
+
+	return !(opcode == IORING_OP_READ_FIXED ||
+		 opcode == IORING_OP_WRITE_FIXED);
+}
+
+static inline bool io_prep_async_work(struct io_kiocb *req)
+{
+	bool do_hashed = false;
 
 	if (req->submit.sqe) {
 		switch (req->submit.sqe->opcode) {
 		case IORING_OP_WRITEV:
 		case IORING_OP_WRITE_FIXED:
-			rw = !(req->rw.ki_flags & IOCB_DIRECT);
+			do_hashed = true;
 			break;
 		}
+		if (io_sqe_needs_user(req->submit.sqe))
+			req->work.flags |= IO_WQ_WORK_NEEDS_USER;
 	}
 
-	trace_io_uring_queue_async_work(ctx, rw, req, &req->work, req->flags);
-	queue_work(ctx->sqo_wq[rw], &req->work);
+	return do_hashed;
+}
+
+static inline void io_queue_async_work(struct io_ring_ctx *ctx,
+				       struct io_kiocb *req)
+{
+	bool do_hashed = io_prep_async_work(req);
+
+	trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
+					req->flags);
+	if (!do_hashed) {
+		io_wq_enqueue(ctx->io_wq, &req->work);
+	} else {
+		io_wq_enqueue_hashed(ctx->io_wq, &req->work,
+					file_inode(req->file));
+	}
 }
 
 static void io_kill_timeout(struct io_kiocb *req)
@@ -647,6 +652,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 	/* one is dropped after submission, the other at completion */
 	refcount_set(&req->refs, 2);
 	req->result = 0;
+	INIT_IO_WORK(&req->work, io_wq_submit_work);
 	return req;
 out:
 	percpu_ref_put(&ctx->refs);
@@ -693,12 +699,10 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 		 * If we're in async work, we can continue processing the chain
 		 * in this context instead of having to queue up new async work.
 		 */
-		if (nxtptr && current_work()) {
+		if (nxtptr && current_work())
 			*nxtptr = nxt;
-		} else {
-			INIT_WORK(&nxt->work, io_sq_wq_submit_work);
+		else
 			io_queue_async_work(req->ctx, nxt);
-		}
 	}
 }
 
@@ -757,12 +761,10 @@ static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr)
 
 	nxt = io_put_req_find_next(req);
 	if (nxt) {
-		if (nxtptr) {
+		if (nxtptr)
 			*nxtptr = nxt;
-		} else {
-			INIT_WORK(&nxt->work, io_sq_wq_submit_work);
+		else
 			io_queue_async_work(nxt->ctx, nxt);
-		}
 	}
 }
 
@@ -1324,65 +1326,6 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
 	return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
 }
 
-static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb)
-{
-	if (al->file == kiocb->ki_filp) {
-		off_t start, end;
-
-		/*
-		 * Allow merging if we're anywhere in the range of the same
-		 * page. Generally this happens for sub-page reads or writes,
-		 * and it's beneficial to allow the first worker to bring the
-		 * page in and the piggy backed work can then work on the
-		 * cached page.
-		 */
-		start = al->io_start & PAGE_MASK;
-		end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK;
-		if (kiocb->ki_pos >= start && kiocb->ki_pos <= end)
-			return true;
-	}
-
-	al->file = NULL;
-	return false;
-}
-
-/*
- * Make a note of the last file/offset/direction we punted to async
- * context. We'll use this information to see if we can piggy back a
- * sequential request onto the previous one, if it's still hasn't been
- * completed by the async worker.
- */
-static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
-{
-	struct async_list *async_list = &req->ctx->pending_async[rw];
-	struct kiocb *kiocb = &req->rw;
-	struct file *filp = kiocb->ki_filp;
-
-	if (io_should_merge(async_list, kiocb)) {
-		unsigned long max_bytes;
-
-		/* Use 8x RA size as a decent limiter for both reads/writes */
-		max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
-		if (!max_bytes)
-			max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
-
-		/* If max len are exceeded, reset the state */
-		if (async_list->io_len + len <= max_bytes) {
-			req->flags |= REQ_F_SEQ_PREV;
-			async_list->io_len += len;
-		} else {
-			async_list->file = NULL;
-		}
-	}
-
-	/* New file? Reset state. */
-	if (async_list->file != filp) {
-		async_list->io_start = kiocb->ki_pos;
-		async_list->io_len = len;
-		async_list->file = filp;
-	}
-}
-
 /*
  * For files that don't have ->read_iter() and ->write_iter(), handle them
  * by looping over ->read() or ->write() manually.
@@ -1477,13 +1420,10 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 		    ret2 > 0 && ret2 < read_size)
 			ret2 = -EAGAIN;
 		/* Catch -EAGAIN return for forced non-blocking submission */
-		if (!force_nonblock || ret2 != -EAGAIN) {
+		if (!force_nonblock || ret2 != -EAGAIN)
 			kiocb_done(kiocb, ret2, nxt, s->in_async);
-		} else {
-			if (!s->in_async)
-				io_async_list_note(READ, req, iov_count);
+		else
 			ret = -EAGAIN;
-		}
 	}
 	kfree(iovec);
 	return ret;
@@ -1517,11 +1457,8 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	iov_count = iov_iter_count(&iter);
 
 	ret = -EAGAIN;
-	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
-		if (!s->in_async)
-			io_async_list_note(WRITE, req, iov_count);
+	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
 		goto out_free;
-	}
 
 	ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
 	if (!ret) {
@@ -1546,13 +1483,10 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 			ret2 = call_write_iter(file, kiocb, &iter);
 		else
 			ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
-		if (!force_nonblock || ret2 != -EAGAIN) {
+		if (!force_nonblock || ret2 != -EAGAIN)
 			kiocb_done(kiocb, ret2, nxt, s->in_async);
-		} else {
-			if (!s->in_async)
-				io_async_list_note(WRITE, req, iov_count);
+		else
 			ret = -EAGAIN;
-		}
 	}
 out_free:
 	kfree(iovec);
@@ -1794,14 +1728,18 @@ static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	io_commit_cqring(ctx);
 }
 
-static void io_poll_complete_work(struct work_struct *work)
+static void io_poll_complete_work(struct io_wq_work **workptr)
 {
+	struct io_wq_work *work = *workptr;
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	struct io_poll_iocb *poll = &req->poll;
 	struct poll_table_struct pt = { ._key = poll->events };
 	struct io_ring_ctx *ctx = req->ctx;
 	__poll_t mask = 0;
 
+	if (work->flags & IO_WQ_WORK_CANCEL)
+		WRITE_ONCE(poll->canceled, true);
+
 	if (!READ_ONCE(poll->canceled))
 		mask = vfs_poll(poll->file, &pt) & poll->events;
 
@@ -1894,7 +1832,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EBADF;
 
 	req->submit.sqe = NULL;
-	INIT_WORK(&req->work, io_poll_complete_work);
+	INIT_IO_WORK(&req->work, io_poll_complete_work);
 	events = READ_ONCE(sqe->poll_events);
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
@@ -2152,7 +2090,6 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
 	req->submit.sqe = sqe_copy;
 
-	INIT_WORK(&req->work, io_sq_wq_submit_work);
 	trace_io_uring_defer(ctx, req, false);
 	list_add_tail(&req->list, &ctx->defer_list);
 	spin_unlock_irq(&ctx->completion_lock);
@@ -2235,188 +2172,56 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	return 0;
 }
 
-static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
-						 const struct io_uring_sqe *sqe)
-{
-	switch (sqe->opcode) {
-	case IORING_OP_READV:
-	case IORING_OP_READ_FIXED:
-		return &ctx->pending_async[READ];
-	case IORING_OP_WRITEV:
-	case IORING_OP_WRITE_FIXED:
-		return &ctx->pending_async[WRITE];
-	default:
-		return NULL;
-	}
-}
-
-static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
-{
-	u8 opcode = READ_ONCE(sqe->opcode);
-
-	return !(opcode == IORING_OP_READ_FIXED ||
-		 opcode == IORING_OP_WRITE_FIXED);
-}
-
-static void io_sq_wq_submit_work(struct work_struct *work)
+static void io_wq_submit_work(struct io_wq_work **workptr)
 {
+	struct io_wq_work *work = *workptr;
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	struct io_ring_ctx *ctx = req->ctx;
-	struct mm_struct *cur_mm = NULL;
-	struct async_list *async_list;
-	LIST_HEAD(req_list);
-	mm_segment_t old_fs;
-	int ret;
+	struct sqe_submit *s = &req->submit;
+	const struct io_uring_sqe *sqe = s->sqe;
+	struct io_kiocb *nxt = NULL;
+	int ret = 0;
 
-	async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
-restart:
-	do {
-		struct sqe_submit *s = &req->submit;
-		const struct io_uring_sqe *sqe = s->sqe;
-		unsigned int flags = req->flags;
-		struct io_kiocb *nxt = NULL;
+	/* Ensure we clear previously set non-block flag */
+	req->rw.ki_flags &= ~IOCB_NOWAIT;
 
-		/* Ensure we clear previously set non-block flag */
-		req->rw.ki_flags &= ~IOCB_NOWAIT;
+	if (work->flags & IO_WQ_WORK_CANCEL)
+		ret = -ECANCELED;
 
-		ret = 0;
-		if (io_sqe_needs_user(sqe) && !cur_mm) {
-			if (!mmget_not_zero(ctx->sqo_mm)) {
-				ret = -EFAULT;
-			} else {
-				cur_mm = ctx->sqo_mm;
-				use_mm(cur_mm);
-				old_fs = get_fs();
-				set_fs(USER_DS);
-			}
-		}
+	if (!ret) {
+		s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
+		s->in_async = true;
+		do {
+			ret = __io_submit_sqe(ctx, req, s, &nxt, false);
+			/*
+			 * We can get EAGAIN for polled IO even though we're
+			 * forcing a sync submission from here, since we can't
+			 * wait for request slots on the block side.
+			 */
+			if (ret != -EAGAIN)
+				break;
+			cond_resched();
+		} while (1);
+	}
 
-		if (!ret) {
-			s->has_user = cur_mm != NULL;
-			s->in_async = true;
-			do {
-				ret = __io_submit_sqe(ctx, req, s, &nxt, false);
-				/*
-				 * We can get EAGAIN for polled IO even though
-				 * we're forcing a sync submission from here,
-				 * since we can't wait for request slots on the
-				 * block side.
-				 */
-				if (ret != -EAGAIN)
-					break;
-				cond_resched();
-			} while (1);
-		}
+	/* drop submission reference */
+	io_put_req(req, NULL);
 
-		/* drop submission reference */
+	if (ret) {
+		io_cqring_add_event(ctx, sqe->user_data, ret);
 		io_put_req(req, NULL);
-
-		if (ret) {
-			io_cqring_add_event(ctx, sqe->user_data, ret);
-			io_put_req(req, NULL);
-		}
-
-		/* async context always use a copy of the sqe */
-		kfree(sqe);
-
-		/* if a dependent link is ready, do that as the next one */
-		if (!ret && nxt) {
-			req = nxt;
-			continue;
-		}
-
-		/* req from defer and link list needn't decrease async cnt */
-		if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
-			goto out;
-
-		if (!async_list)
-			break;
-		if (!list_empty(&req_list)) {
-			req = list_first_entry(&req_list, struct io_kiocb,
-						list);
-			list_del(&req->list);
-			continue;
-		}
-		if (list_empty(&async_list->list))
-			break;
-
-		req = NULL;
-		spin_lock(&async_list->lock);
-		if (list_empty(&async_list->list)) {
-			spin_unlock(&async_list->lock);
-			break;
-		}
-		list_splice_init(&async_list->list, &req_list);
-		spin_unlock(&async_list->lock);
-
-		req = list_first_entry(&req_list, struct io_kiocb, list);
-		list_del(&req->list);
-	} while (req);
-
-	/*
-	 * Rare case of racing with a submitter. If we find the count has
-	 * dropped to zero AND we have pending work items, then restart
-	 * the processing. This is a tiny race window.
-	 */
-	if (async_list) {
-		ret = atomic_dec_return(&async_list->cnt);
-		while (!ret && !list_empty(&async_list->list)) {
-			spin_lock(&async_list->lock);
-			atomic_inc(&async_list->cnt);
-			list_splice_init(&async_list->list, &req_list);
-			spin_unlock(&async_list->lock);
-
-			if (!list_empty(&req_list)) {
-				req = list_first_entry(&req_list,
-							struct io_kiocb, list);
-				list_del(&req->list);
-				goto restart;
-			}
-			ret = atomic_dec_return(&async_list->cnt);
-		}
 	}
 
-out:
-	if (cur_mm) {
-		set_fs(old_fs);
-		unuse_mm(cur_mm);
-		mmput(cur_mm);
+	/* async context always use a copy of the sqe */
+	kfree(sqe);
+
+	/* if a dependent link is ready, pass it back */
+	if (!ret && nxt) {
+		io_prep_async_work(nxt);
+		*workptr = &nxt->work;
 	}
 }
 
-/*
- * See if we can piggy back onto previously submitted work, that is still
- * running. We currently only allow this if the new request is sequential
- * to the previous one we punted.
- */
-static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
-{
-	bool ret;
-
-	if (!list)
-		return false;
-	if (!(req->flags & REQ_F_SEQ_PREV))
-		return false;
-	if (!atomic_read(&list->cnt))
-		return false;
-
-	ret = true;
-	spin_lock(&list->lock);
-	list_add_tail(&req->list, &list->list);
-	/*
-	 * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
-	 */
-	smp_mb();
-	if (!atomic_read(&list->cnt)) {
-		list_del_init(&req->list);
-		ret = false;
-	}
-	spin_unlock(&list->lock);
-
-	trace_io_uring_add_to_prev(req, ret);
-	return ret;
-}
-
 static bool io_op_needs_file(const struct io_uring_sqe *sqe)
 {
 	int op = READ_ONCE(sqe->opcode);
@@ -2488,17 +2293,9 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
 		if (sqe_copy) {
-			struct async_list *list;
-
 			s->sqe = sqe_copy;
 			memcpy(&req->submit, s, sizeof(*s));
-			list = io_async_list_from_sqe(ctx, s->sqe);
-			if (!io_add_to_prev_work(list, req)) {
-				if (list)
-					atomic_inc(&list->cnt);
-				INIT_WORK(&req->work, io_sq_wq_submit_work);
-				io_queue_async_work(ctx, req);
-			}
+			io_queue_async_work(ctx, req);
 
 			/*
 			 * Queued up for async execution, worker will release
@@ -3109,15 +2906,11 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
 
 static void io_finish_async(struct io_ring_ctx *ctx)
 {
-	int i;
-
 	io_sq_thread_stop(ctx);
 
-	for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) {
-		if (ctx->sqo_wq[i]) {
-			destroy_workqueue(ctx->sqo_wq[i]);
-			ctx->sqo_wq[i] = NULL;
-		}
+	if (ctx->io_wq) {
+		io_wq_destroy(ctx->io_wq);
+		ctx->io_wq = NULL;
 	}
 }
 
@@ -3125,11 +2918,9 @@ static void io_finish_async(struct io_ring_ctx *ctx)
 static void io_destruct_skb(struct sk_buff *skb)
 {
 	struct io_ring_ctx *ctx = skb->sk->sk_user_data;
-	int i;
 
-	for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++)
-		if (ctx->sqo_wq[i])
-			flush_workqueue(ctx->sqo_wq[i]);
+	if (ctx->io_wq)
+		io_wq_flush(ctx->io_wq);
 
 	unix_destruct_scm(skb);
 }
@@ -3473,6 +3264,7 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
 static int io_sq_offload_start(struct io_ring_ctx *ctx,
 			       struct io_uring_params *p)
 {
+	unsigned concurrency;
 	int ret;
 
 	init_waitqueue_head(&ctx->sqo_wait);
@@ -3516,25 +3308,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 		goto err;
 	}
 
-	/* Do QD, or 2 * CPUS, whatever is smallest */
-	ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq",
-			WQ_UNBOUND | WQ_FREEZABLE,
-			min(ctx->sq_entries - 1, 2 * num_online_cpus()));
-	if (!ctx->sqo_wq[0]) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	/*
-	 * This is for buffered writes, where we want to limit the parallelism
-	 * due to file locking in file systems. As "normal" buffered writes
-	 * should parellelize on writeout quite nicely, limit us to having 2
-	 * pending. This avoids massive contention on the inode when doing
-	 * buffered async writes.
-	 */
-	ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq",
-						WQ_UNBOUND | WQ_FREEZABLE, 2);
-	if (!ctx->sqo_wq[1]) {
+	/* Do QD, or 4 * CPUS, whatever is smallest */
+	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
+	ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm);
+	if (!ctx->io_wq) {
 		ret = -ENOMEM;
 		goto err;
 	}
@@ -3919,6 +3696,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 
 	io_kill_timeouts(ctx);
 	io_poll_remove_all(ctx);
+
+	if (ctx->io_wq)
+		io_wq_cancel_all(ctx->io_wq);
+
 	io_iopoll_reap_events(ctx);
 	wait_for_completion(&ctx->ctx_done);
 	io_ring_ctx_free(ctx);
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index c5a905fbf1da..b85255121b98 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -7,6 +7,8 @@
 
 #include <linux/tracepoint.h>
 
+struct io_wq_work;
+
 /**
  * io_uring_create - called after a new io_uring context was prepared
  *
@@ -126,15 +128,15 @@ TRACE_EVENT(io_uring_file_get,
  * io_uring_queue_async_work - called before submitting a new async work
  *
  * @ctx:	pointer to a ring context structure
- * @rw:		type of workqueue, normal or buffered writes
+ * @hashed:	type of workqueue, hashed or normal
  * @req:	pointer to a submitted request
- * @work:	pointer to a submitted work_struct
+ * @work:	pointer to a submitted io_wq_work
  *
  * Allows to trace asynchronous work submission.
  */
 TRACE_EVENT(io_uring_queue_async_work,
 
-	TP_PROTO(void *ctx, int rw, void * req, struct work_struct *work,
+	TP_PROTO(void *ctx, int rw, void * req, struct io_wq_work *work,
 			 unsigned int flags),
 
 	TP_ARGS(ctx, rw, req, work, flags),
@@ -143,7 +145,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__field(  void *,				ctx		)
 		__field(  int,					rw		)
 		__field(  void *,				req		)
-		__field(  struct work_struct *,	work	)
+		__field(  struct io_wq_work *,		work	)
 		__field(  unsigned int,			flags	)
 	),
 
@@ -157,7 +159,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 
 	TP_printk("ring %p, request %p, flags %d, %s queue, work %p",
 			  __entry->ctx, __entry->req, __entry->flags,
-			  __entry->rw ? "buffered" : "normal", __entry->work)
+			  __entry->rw ? "hashed" : "normal", __entry->work)
 );
 
 /**
diff --git a/init/Kconfig b/init/Kconfig
index b4daad2bac23..4d8d145c41d2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1548,6 +1548,7 @@ config AIO
 config IO_URING
 	bool "Enable IO uring support" if EXPERT
 	select ANON_INODES
+	select IO_WQ
 	default y
 	help
 	  This option enables support for the io_uring interface, enabling

From fcb323cc53e29d9cc696d606bb42736b32dd9825 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 24 Oct 2019 12:39:47 -0600
Subject: [PATCH 13/64] io_uring: io_uring: add support for async work
 inheriting files

This is in preparation for adding opcodes that need to add new files
in a process file table, system calls like open(2) or accept4(2).

If an opcode needs this, it must set IO_WQ_WORK_NEEDS_FILES in the work
item. If work that needs to get punted to async context have this
set, the async worker will assume the original task file table before
executing the work.

Note that opcodes that need access to the current files of an
application cannot be done through IORING_SETUP_SQPOLL.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    |  30 +++++++++++--
 fs/io-wq.h    |   3 ++
 fs/io_uring.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 141 insertions(+), 8 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 37863879e987..253c04a40db5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -52,6 +52,7 @@ struct io_worker {
 
 	struct rcu_head rcu;
 	struct mm_struct *mm;
+	struct files_struct *restore_files;
 };
 
 struct io_wq_nulls_list {
@@ -126,22 +127,36 @@ static void io_worker_release(struct io_worker *worker)
  */
 static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
 {
+	bool dropped_lock = false;
+
+	if (current->files != worker->restore_files) {
+		__acquire(&wqe->lock);
+		spin_unlock_irq(&wqe->lock);
+		dropped_lock = true;
+
+		task_lock(current);
+		current->files = worker->restore_files;
+		task_unlock(current);
+	}
+
 	/*
 	 * If we have an active mm, we need to drop the wq lock before unusing
 	 * it. If we do, return true and let the caller retry the idle loop.
 	 */
 	if (worker->mm) {
-		__acquire(&wqe->lock);
-		spin_unlock_irq(&wqe->lock);
+		if (!dropped_lock) {
+			__acquire(&wqe->lock);
+			spin_unlock_irq(&wqe->lock);
+			dropped_lock = true;
+		}
 		__set_current_state(TASK_RUNNING);
 		set_fs(KERNEL_DS);
 		unuse_mm(worker->mm);
 		mmput(worker->mm);
 		worker->mm = NULL;
-		return true;
 	}
 
-	return false;
+	return dropped_lock;
 }
 
 static void io_worker_exit(struct io_worker *worker)
@@ -189,6 +204,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
 	current->flags |= PF_IO_WORKER;
 
 	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
+	worker->restore_files = current->files;
 	atomic_inc(&wqe->nr_running);
 }
 
@@ -291,6 +307,12 @@ static void io_worker_handle_work(struct io_worker *worker)
 		if (!work)
 			break;
 next:
+		if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
+		    current->files != work->files) {
+			task_lock(current);
+			current->files = work->files;
+			task_unlock(current);
+		}
 		if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
 		    wq->mm && mmget_not_zero(wq->mm)) {
 			use_mm(wq->mm);
diff --git a/fs/io-wq.h b/fs/io-wq.h
index be8f22c8937b..e93f764b1fa4 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -8,6 +8,7 @@ enum {
 	IO_WQ_WORK_HAS_MM	= 2,
 	IO_WQ_WORK_HASHED	= 4,
 	IO_WQ_WORK_NEEDS_USER	= 8,
+	IO_WQ_WORK_NEEDS_FILES	= 16,
 
 	IO_WQ_HASH_SHIFT	= 24,	/* upper 8 bits are used for hash key */
 };
@@ -22,12 +23,14 @@ struct io_wq_work {
 	struct list_head list;
 	void (*func)(struct io_wq_work **);
 	unsigned flags;
+	struct files_struct *files;
 };
 
 #define INIT_IO_WORK(work, _func)			\
 	do {						\
 		(work)->func = _func;			\
 		(work)->flags = 0;			\
+		(work)->files = NULL;			\
 	} while (0)					\
 
 struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d94bd4e3a60e..6e1523567920 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -196,6 +196,8 @@ struct io_ring_ctx {
 
 		struct list_head	defer_list;
 		struct list_head	timeout_list;
+
+		wait_queue_head_t	inflight_wait;
 	} ____cacheline_aligned_in_smp;
 
 	/* IO offload */
@@ -250,6 +252,9 @@ struct io_ring_ctx {
 		 */
 		struct list_head	poll_list;
 		struct list_head	cancel_list;
+
+		spinlock_t		inflight_lock;
+		struct list_head	inflight_list;
 	} ____cacheline_aligned_in_smp;
 
 #if defined(CONFIG_UNIX)
@@ -259,6 +264,8 @@ struct io_ring_ctx {
 
 struct sqe_submit {
 	const struct io_uring_sqe	*sqe;
+	struct file			*ring_file;
+	int				ring_fd;
 	u32				sequence;
 	bool				has_user;
 	bool				in_async;
@@ -317,10 +324,13 @@ struct io_kiocb {
 #define REQ_F_TIMEOUT		1024	/* timeout request */
 #define REQ_F_ISREG		2048	/* regular file */
 #define REQ_F_MUST_PUNT		4096	/* must be punted even for NONBLOCK */
+#define REQ_F_INFLIGHT		8192	/* on inflight list */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
 
+	struct list_head	inflight_entry;
+
 	struct io_wq_work	work;
 };
 
@@ -401,6 +411,9 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->cancel_list);
 	INIT_LIST_HEAD(&ctx->defer_list);
 	INIT_LIST_HEAD(&ctx->timeout_list);
+	init_waitqueue_head(&ctx->inflight_wait);
+	spin_lock_init(&ctx->inflight_lock);
+	INIT_LIST_HEAD(&ctx->inflight_list);
 	return ctx;
 }
 
@@ -670,9 +683,20 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 
 static void __io_free_req(struct io_kiocb *req)
 {
+	struct io_ring_ctx *ctx = req->ctx;
+
 	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 		fput(req->file);
-	percpu_ref_put(&req->ctx->refs);
+	if (req->flags & REQ_F_INFLIGHT) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&ctx->inflight_lock, flags);
+		list_del(&req->inflight_entry);
+		if (waitqueue_active(&ctx->inflight_wait))
+			wake_up(&ctx->inflight_wait);
+		spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+	}
+	percpu_ref_put(&ctx->refs);
 	kmem_cache_free(req_cachep, req);
 }
 
@@ -2276,6 +2300,30 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 	return 0;
 }
 
+static int io_grab_files(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	int ret = -EBADF;
+
+	rcu_read_lock();
+	spin_lock_irq(&ctx->inflight_lock);
+	/*
+	 * We use the f_ops->flush() handler to ensure that we can flush
+	 * out work accessing these files if the fd is closed. Check if
+	 * the fd has changed since we started down this path, and disallow
+	 * this operation if it has.
+	 */
+	if (fcheck(req->submit.ring_fd) == req->submit.ring_file) {
+		list_add(&req->inflight_entry, &ctx->inflight_list);
+		req->flags |= REQ_F_INFLIGHT;
+		req->work.files = current->files;
+		ret = 0;
+	}
+	spin_unlock_irq(&ctx->inflight_lock);
+	rcu_read_unlock();
+
+	return ret;
+}
+
 static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			struct sqe_submit *s)
 {
@@ -2295,17 +2343,25 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		if (sqe_copy) {
 			s->sqe = sqe_copy;
 			memcpy(&req->submit, s, sizeof(*s));
-			io_queue_async_work(ctx, req);
+			if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
+				ret = io_grab_files(ctx, req);
+				if (ret) {
+					kfree(sqe_copy);
+					goto err;
+				}
+			}
 
 			/*
 			 * Queued up for async execution, worker will release
 			 * submit reference when the iocb is actually submitted.
 			 */
+			io_queue_async_work(ctx, req);
 			return 0;
 		}
 	}
 
 	/* drop submission reference */
+err:
 	io_put_req(req, NULL);
 
 	/* and drop final reference, if we failed */
@@ -2509,6 +2565,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 
 	head = READ_ONCE(sq_array[head & ctx->sq_mask]);
 	if (head < ctx->sq_entries) {
+		s->ring_file = NULL;
 		s->sqe = &ctx->sq_sqes[head];
 		s->sequence = ctx->cached_sq_head;
 		ctx->cached_sq_head++;
@@ -2708,7 +2765,8 @@ static int io_sq_thread(void *data)
 	return 0;
 }
 
-static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
+static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
+			  struct file *ring_file, int ring_fd)
 {
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
@@ -2750,9 +2808,11 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 		}
 
 out:
+		s.ring_file = ring_file;
 		s.has_user = true;
 		s.in_async = false;
 		s.needs_fixed_file = false;
+		s.ring_fd = ring_fd;
 		submit++;
 		trace_io_uring_submit_sqe(ctx, true, false);
 		io_submit_sqe(ctx, &s, statep, &link);
@@ -3714,6 +3774,53 @@ static int io_uring_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static void io_uring_cancel_files(struct io_ring_ctx *ctx,
+				  struct files_struct *files)
+{
+	struct io_kiocb *req;
+	DEFINE_WAIT(wait);
+
+	while (!list_empty_careful(&ctx->inflight_list)) {
+		enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
+
+		spin_lock_irq(&ctx->inflight_lock);
+		list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
+			if (req->work.files == files) {
+				ret = io_wq_cancel_work(ctx->io_wq, &req->work);
+				break;
+			}
+		}
+		if (ret == IO_WQ_CANCEL_RUNNING)
+			prepare_to_wait(&ctx->inflight_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+
+		spin_unlock_irq(&ctx->inflight_lock);
+
+		/*
+		 * We need to keep going until we get NOTFOUND. We only cancel
+		 * one work at the time.
+		 *
+		 * If we get CANCEL_RUNNING, then wait for a work to complete
+		 * before continuing.
+		 */
+		if (ret == IO_WQ_CANCEL_OK)
+			continue;
+		else if (ret != IO_WQ_CANCEL_RUNNING)
+			break;
+		schedule();
+	}
+}
+
+static int io_uring_flush(struct file *file, void *data)
+{
+	struct io_ring_ctx *ctx = file->private_data;
+
+	io_uring_cancel_files(ctx, data);
+	if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
+		io_wq_cancel_all(ctx->io_wq);
+	return 0;
+}
+
 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
@@ -3782,7 +3889,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		to_submit = min(to_submit, ctx->sq_entries);
 
 		mutex_lock(&ctx->uring_lock);
-		submitted = io_ring_submit(ctx, to_submit);
+		submitted = io_ring_submit(ctx, to_submit, f.file, fd);
 		mutex_unlock(&ctx->uring_lock);
 	}
 	if (flags & IORING_ENTER_GETEVENTS) {
@@ -3805,6 +3912,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 
 static const struct file_operations io_uring_fops = {
 	.release	= io_uring_release,
+	.flush		= io_uring_flush,
 	.mmap		= io_uring_mmap,
 	.poll		= io_uring_poll,
 	.fasync		= io_uring_fasync,

From de2ea4b64b75a79ed9cdf9bf30e0e197901084e4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 17 Oct 2019 14:41:29 -0600
Subject: [PATCH 14/64] net: add __sys_accept4_file() helper

This is identical to __sys_accept4(), except it takes a struct file
instead of an fd, and it also allows passing in extra file->f_flags
flags. The latter is done to support masking in O_NONBLOCK without
manipulating the original file flags.

No functional changes in this patch.

Cc: netdev@vger.kernel.org
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/socket.h |  3 ++
 net/socket.c           | 65 ++++++++++++++++++++++++++----------------
 2 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index fc0bed59fc84..dd061f741bc1 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -392,6 +392,9 @@ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
 extern int __sys_sendto(int fd, void __user *buff, size_t len,
 			unsigned int flags, struct sockaddr __user *addr,
 			int addr_len);
+extern int __sys_accept4_file(struct file *file, unsigned file_flags,
+			struct sockaddr __user *upeer_sockaddr,
+			 int __user *upeer_addrlen, int flags);
 extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 			 int __user *upeer_addrlen, int flags);
 extern int __sys_socket(int family, int type, int protocol);
diff --git a/net/socket.c b/net/socket.c
index 6a9ab7a8b1d2..40ab39f6c5d8 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1690,24 +1690,13 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog)
 	return __sys_listen(fd, backlog);
 }
 
-/*
- *	For accept, we attempt to create a new socket, set up the link
- *	with the client, wake up the client, then return the new
- *	connected fd. We collect the address of the connector in kernel
- *	space and move it to user at the very end. This is unclean because
- *	we open the socket then return an error.
- *
- *	1003.1g adds the ability to recvmsg() to query connection pending
- *	status to recvmsg. We need to add that support in a way thats
- *	clean when we restructure accept also.
- */
-
-int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
-		  int __user *upeer_addrlen, int flags)
+int __sys_accept4_file(struct file *file, unsigned file_flags,
+		       struct sockaddr __user *upeer_sockaddr,
+		       int __user *upeer_addrlen, int flags)
 {
 	struct socket *sock, *newsock;
 	struct file *newfile;
-	int err, len, newfd, fput_needed;
+	int err, len, newfd;
 	struct sockaddr_storage address;
 
 	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
@@ -1716,14 +1705,14 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
 		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	sock = sock_from_file(file, &err);
 	if (!sock)
 		goto out;
 
 	err = -ENFILE;
 	newsock = sock_alloc();
 	if (!newsock)
-		goto out_put;
+		goto out;
 
 	newsock->type = sock->type;
 	newsock->ops = sock->ops;
@@ -1738,20 +1727,21 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 	if (unlikely(newfd < 0)) {
 		err = newfd;
 		sock_release(newsock);
-		goto out_put;
+		goto out;
 	}
 	newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
 	if (IS_ERR(newfile)) {
 		err = PTR_ERR(newfile);
 		put_unused_fd(newfd);
-		goto out_put;
+		goto out;
 	}
 
 	err = security_socket_accept(sock, newsock);
 	if (err)
 		goto out_fd;
 
-	err = sock->ops->accept(sock, newsock, sock->file->f_flags, false);
+	err = sock->ops->accept(sock, newsock, sock->file->f_flags | file_flags,
+					false);
 	if (err < 0)
 		goto out_fd;
 
@@ -1772,15 +1762,42 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 
 	fd_install(newfd, newfile);
 	err = newfd;
-
-out_put:
-	fput_light(sock->file, fput_needed);
 out:
 	return err;
 out_fd:
 	fput(newfile);
 	put_unused_fd(newfd);
-	goto out_put;
+	goto out;
+
+}
+
+/*
+ *	For accept, we attempt to create a new socket, set up the link
+ *	with the client, wake up the client, then return the new
+ *	connected fd. We collect the address of the connector in kernel
+ *	space and move it to user at the very end. This is unclean because
+ *	we open the socket then return an error.
+ *
+ *	1003.1g adds the ability to recvmsg() to query connection pending
+ *	status to recvmsg. We need to add that support in a way thats
+ *	clean when we restructure accept also.
+ */
+
+int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
+		  int __user *upeer_addrlen, int flags)
+{
+	int ret = -EBADF;
+	struct fd f;
+
+	f = fdget(fd);
+	if (f.file) {
+		ret = __sys_accept4_file(f.file, 0, upeer_sockaddr,
+						upeer_addrlen, flags);
+		if (f.flags)
+			fput(f.file);
+	}
+
+	return ret;
 }
 
 SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,

From 17f2fe35d080d8f64e86a60cdcd3a97edcbc213b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 17 Oct 2019 14:42:58 -0600
Subject: [PATCH 15/64] io_uring: add support for IORING_OP_ACCEPT

This allows an application to call accept4() in an async fashion. Like
other opcodes, we first try a non-blocking accept, then punt to async
context if we have to.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 37 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  7 ++++++-
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6e1523567920..b668149c20b9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1686,6 +1686,40 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 #endif
 }
 
+static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		     struct io_kiocb **nxt, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	struct sockaddr __user *addr;
+	int __user *addr_len;
+	unsigned file_flags;
+	int flags, ret;
+
+	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+		return -EINVAL;
+	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+		return -EINVAL;
+
+	addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
+	addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2);
+	flags = READ_ONCE(sqe->accept_flags);
+	file_flags = force_nonblock ? O_NONBLOCK : 0;
+
+	ret = __sys_accept4_file(req->file, file_flags, addr, addr_len, flags);
+	if (ret == -EAGAIN && force_nonblock) {
+		req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
+		return -EAGAIN;
+	}
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
+	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_put_req(req, nxt);
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
 static void io_poll_remove_one(struct io_kiocb *req)
 {
 	struct io_poll_iocb *poll = &req->poll;
@@ -2173,6 +2207,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_TIMEOUT_REMOVE:
 		ret = io_timeout_remove(req, s->sqe);
 		break;
+	case IORING_OP_ACCEPT:
+		ret = io_accept(req, s->sqe, nxt, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6dc5ced1c37a..f82d90e617a6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -19,7 +19,10 @@ struct io_uring_sqe {
 	__u8	flags;		/* IOSQE_ flags */
 	__u16	ioprio;		/* ioprio for the request */
 	__s32	fd;		/* file descriptor to do IO on */
-	__u64	off;		/* offset into file */
+	union {
+		__u64	off;	/* offset into file */
+		__u64	addr2;
+	};
 	__u64	addr;		/* pointer to buffer or iovecs */
 	__u32	len;		/* buffer size or number of iovecs */
 	union {
@@ -29,6 +32,7 @@ struct io_uring_sqe {
 		__u32		sync_range_flags;
 		__u32		msg_flags;
 		__u32		timeout_flags;
+		__u32		accept_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -65,6 +69,7 @@ struct io_uring_sqe {
 #define IORING_OP_RECVMSG	10
 #define IORING_OP_TIMEOUT	11
 #define IORING_OP_TIMEOUT_REMOVE	12
+#define IORING_OP_ACCEPT	13
 
 /*
  * sqe->fsync_flags

From b7620121dc04e44ce654297050f9eaf39d414a34 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 26 Oct 2019 07:22:55 -0600
Subject: [PATCH 16/64] io_uring: protect fixed file indexing with
 array_index_nospec()

We index the file tables with a user given value. After we check
it's within our limits, use array_index_nospec() to prevent any
spectre attacks here.

Suggested-by: Jann Horn <jannh@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b668149c20b9..7743b180a3e0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2321,6 +2321,7 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 		if (unlikely(!ctx->user_files ||
 		    (unsigned) fd >= ctx->nr_user_files))
 			return -EBADF;
+		fd = array_index_nospec(fd, ctx->nr_user_files);
 		if (!ctx->user_files[fd])
 			return -EBADF;
 		req->file = ctx->user_files[fd];

From 65e19f54d29cd8559ce60cfd0d751bef7afbdc5c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 26 Oct 2019 07:20:21 -0600
Subject: [PATCH 17/64] io_uring: support for larger fixed file sets

There's been a few requests for supporting more fixed files than 1024.
This isn't really tricky to do, we just need to split up the file table
into multiple tables and index appropriately. As we do so, reduce the
max single file table to 512. This enables us to do single page allocs
always for the tables, which is an improvement over the situation prior.

This patch adds support for up to 64K files, which should be enough for
everyone.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 150 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 117 insertions(+), 33 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7743b180a3e0..281d0b7597cf 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -80,7 +80,14 @@
 
 #define IORING_MAX_ENTRIES	32768
 #define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
-#define IORING_MAX_FIXED_FILES	1024
+
+/*
+ * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
+ */
+#define IORING_FILE_TABLE_SHIFT	9
+#define IORING_MAX_FILES_TABLE	(1U << IORING_FILE_TABLE_SHIFT)
+#define IORING_FILE_TABLE_MASK	(IORING_MAX_FILES_TABLE - 1)
+#define IORING_MAX_FIXED_FILES	(64 * IORING_MAX_FILES_TABLE)
 
 struct io_uring {
 	u32 head ____cacheline_aligned_in_smp;
@@ -165,6 +172,10 @@ struct io_mapped_ubuf {
 	unsigned int	nr_bvecs;
 };
 
+struct fixed_file_table {
+	struct file		**files;
+};
+
 struct io_ring_ctx {
 	struct {
 		struct percpu_ref	refs;
@@ -225,7 +236,7 @@ struct io_ring_ctx {
 	 * readers must ensure that ->refs is alive as long as the file* is
 	 * used. Only updated through io_uring_register(2).
 	 */
-	struct file		**user_files;
+	struct fixed_file_table	*file_table;
 	unsigned		nr_user_files;
 
 	/* if used, fixed mapped user buffers */
@@ -2296,6 +2307,15 @@ static bool io_op_needs_file(const struct io_uring_sqe *sqe)
 	}
 }
 
+static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
+					      int index)
+{
+	struct fixed_file_table *table;
+
+	table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT];
+	return table->files[index & IORING_FILE_TABLE_MASK];
+}
+
 static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 			   struct io_submit_state *state, struct io_kiocb *req)
 {
@@ -2318,13 +2338,13 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 		return 0;
 
 	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files ||
+		if (unlikely(!ctx->file_table ||
 		    (unsigned) fd >= ctx->nr_user_files))
 			return -EBADF;
 		fd = array_index_nospec(fd, ctx->nr_user_files);
-		if (!ctx->user_files[fd])
+		req->file = io_file_from_index(ctx, fd);
+		if (!req->file)
 			return -EBADF;
-		req->file = ctx->user_files[fd];
 		req->flags |= REQ_F_FIXED_FILE;
 	} else {
 		if (s->needs_fixed_file)
@@ -2969,20 +2989,29 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 #else
 	int i;
 
-	for (i = 0; i < ctx->nr_user_files; i++)
-		if (ctx->user_files[i])
-			fput(ctx->user_files[i]);
+	for (i = 0; i < ctx->nr_user_files; i++) {
+		struct file *file;
+
+		file = io_file_from_index(ctx, i);
+		if (file)
+			fput(file);
+	}
 #endif
 }
 
 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
-	if (!ctx->user_files)
+	unsigned nr_tables, i;
+
+	if (!ctx->file_table)
 		return -ENXIO;
 
 	__io_sqe_files_unregister(ctx);
-	kfree(ctx->user_files);
-	ctx->user_files = NULL;
+	nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
+	for (i = 0; i < nr_tables; i++)
+		kfree(ctx->file_table[i].files);
+	kfree(ctx->file_table);
+	ctx->file_table = NULL;
 	ctx->nr_user_files = 0;
 	return 0;
 }
@@ -3057,9 +3086,11 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
 	nr_files = 0;
 	fpl->user = get_uid(ctx->user);
 	for (i = 0; i < nr; i++) {
-		if (!ctx->user_files[i + offset])
+		struct file *file = io_file_from_index(ctx, i + offset);
+
+		if (!file)
 			continue;
-		fpl->fp[nr_files] = get_file(ctx->user_files[i + offset]);
+		fpl->fp[nr_files] = get_file(file);
 		unix_inflight(fpl->user, fpl->fp[nr_files]);
 		nr_files++;
 	}
@@ -3108,8 +3139,10 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
 		return 0;
 
 	while (total < ctx->nr_user_files) {
-		if (ctx->user_files[total])
-			fput(ctx->user_files[total]);
+		struct file *file = io_file_from_index(ctx, total);
+
+		if (file)
+			fput(file);
 		total++;
 	}
 
@@ -3122,25 +3155,63 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
 }
 #endif
 
+static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
+				    unsigned nr_files)
+{
+	int i;
+
+	for (i = 0; i < nr_tables; i++) {
+		struct fixed_file_table *table = &ctx->file_table[i];
+		unsigned this_files;
+
+		this_files = min(nr_files, IORING_MAX_FILES_TABLE);
+		table->files = kcalloc(this_files, sizeof(struct file *),
+					GFP_KERNEL);
+		if (!table->files)
+			break;
+		nr_files -= this_files;
+	}
+
+	if (i == nr_tables)
+		return 0;
+
+	for (i = 0; i < nr_tables; i++) {
+		struct fixed_file_table *table = &ctx->file_table[i];
+		kfree(table->files);
+	}
+	return 1;
+}
+
 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 				 unsigned nr_args)
 {
 	__s32 __user *fds = (__s32 __user *) arg;
+	unsigned nr_tables;
 	int fd, ret = 0;
 	unsigned i;
 
-	if (ctx->user_files)
+	if (ctx->file_table)
 		return -EBUSY;
 	if (!nr_args)
 		return -EINVAL;
 	if (nr_args > IORING_MAX_FIXED_FILES)
 		return -EMFILE;
 
-	ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
-	if (!ctx->user_files)
+	nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
+	ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table),
+					GFP_KERNEL);
+	if (!ctx->file_table)
 		return -ENOMEM;
 
+	if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
+		kfree(ctx->file_table);
+		return -ENOMEM;
+	}
+
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
+		struct fixed_file_table *table;
+		unsigned index;
+
 		ret = -EFAULT;
 		if (copy_from_user(&fd, &fds[i], sizeof(fd)))
 			break;
@@ -3150,10 +3221,12 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			continue;
 		}
 
-		ctx->user_files[i] = fget(fd);
+		table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
+		index = i & IORING_FILE_TABLE_MASK;
+		table->files[index] = fget(fd);
 
 		ret = -EBADF;
-		if (!ctx->user_files[i])
+		if (!table->files[index])
 			break;
 		/*
 		 * Don't allow io_uring instances to be registered. If UNIX
@@ -3162,20 +3235,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		 * handle it just fine, but there's still no point in allowing
 		 * a ring fd as it doesn't support regular read/write anyway.
 		 */
-		if (ctx->user_files[i]->f_op == &io_uring_fops) {
-			fput(ctx->user_files[i]);
+		if (table->files[index]->f_op == &io_uring_fops) {
+			fput(table->files[index]);
 			break;
 		}
 		ret = 0;
 	}
 
 	if (ret) {
-		for (i = 0; i < ctx->nr_user_files; i++)
-			if (ctx->user_files[i])
-				fput(ctx->user_files[i]);
+		for (i = 0; i < ctx->nr_user_files; i++) {
+			struct file *file;
 
-		kfree(ctx->user_files);
-		ctx->user_files = NULL;
+			file = io_file_from_index(ctx, i);
+			if (file)
+				fput(file);
+		}
+		for (i = 0; i < nr_tables; i++)
+			kfree(ctx->file_table[i].files);
+
+		kfree(ctx->file_table);
+		ctx->file_table = NULL;
 		ctx->nr_user_files = 0;
 		return ret;
 	}
@@ -3190,7 +3269,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
 {
 #if defined(CONFIG_UNIX)
-	struct file *file = ctx->user_files[index];
+	struct file *file = io_file_from_index(ctx, index);
 	struct sock *sock = ctx->ring_sock->sk;
 	struct sk_buff_head list, *head = &sock->sk_receive_queue;
 	struct sk_buff *skb;
@@ -3246,7 +3325,7 @@ static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
 		spin_unlock_irq(&head->lock);
 	}
 #else
-	fput(ctx->user_files[index]);
+	fput(io_file_from_index(ctx, index));
 #endif
 }
 
@@ -3301,7 +3380,7 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
 	int fd, i, err;
 	__u32 done;
 
-	if (!ctx->user_files)
+	if (!ctx->file_table)
 		return -ENXIO;
 	if (!nr_args)
 		return -EINVAL;
@@ -3315,15 +3394,20 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
 	done = 0;
 	fds = (__s32 __user *) up.fds;
 	while (nr_args) {
+		struct fixed_file_table *table;
+		unsigned index;
+
 		err = 0;
 		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
 			err = -EFAULT;
 			break;
 		}
 		i = array_index_nospec(up.offset, ctx->nr_user_files);
-		if (ctx->user_files[i]) {
+		table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
+		index = i & IORING_FILE_TABLE_MASK;
+		if (table->files[index]) {
 			io_sqe_file_unregister(ctx, i);
-			ctx->user_files[i] = NULL;
+			table->files[index] = NULL;
 		}
 		if (fd != -1) {
 			struct file *file;
@@ -3346,7 +3430,7 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
 				err = -EBADF;
 				break;
 			}
-			ctx->user_files[i] = file;
+			table->files[index] = file;
 			err = io_sqe_file_register(ctx, file, i);
 			if (err)
 				break;

From 842f96124c5617b060cc0f071dcfb6ab24bdd042 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 29 Oct 2019 12:34:10 -0600
Subject: [PATCH 18/64] io_uring: fix race with canceling timeouts

If we get -1 from hrtimer_try_to_cancel(), we know that the timer
is running. Hence leave all completion to the timeout handler. If
we don't, we can corrupt the list and miss a completion.

Fixes: 11365043e527 ("io_uring: add support for canceling timeout requests")
Reported-by: Hrvoje Zeba <zeba.hrvoje@gmail.com>
Tested-by: Hrvoje Zeba <zeba.hrvoje@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 281d0b7597cf..8e25c25c7309 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -533,7 +533,7 @@ static void io_kill_timeout(struct io_kiocb *req)
 	ret = hrtimer_try_to_cancel(&req->timeout.timer);
 	if (ret != -1) {
 		atomic_inc(&req->ctx->cq_timeouts);
-		list_del(&req->list);
+		list_del_init(&req->list);
 		io_cqring_fill_event(req->ctx, req->user_data, 0);
 		__io_free_req(req);
 	}
@@ -1957,7 +1957,6 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	struct io_ring_ctx *ctx;
 	struct io_kiocb *req;
 	unsigned long flags;
-	bool comp;
 
 	req = container_of(timer, struct io_kiocb, timeout.timer);
 	ctx = req->ctx;
@@ -1968,8 +1967,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	 * We could be racing with timeout deletion. If the list is empty,
 	 * then timeout lookup already found it and will be handling it.
 	 */
-	comp = !list_empty(&req->list);
-	if (comp) {
+	if (!list_empty(&req->list)) {
 		struct io_kiocb *prev;
 
 		/*
@@ -1981,17 +1979,15 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 		prev = req;
 		list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
 			prev->sequence++;
-
 		list_del_init(&req->list);
-		io_cqring_fill_event(ctx, req->user_data, -ETIME);
-		io_commit_cqring(ctx);
 	}
+
+	io_cqring_fill_event(ctx, req->user_data, -ETIME);
+	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	if (comp) {
-		io_cqring_ev_posted(ctx);
-		io_put_req(req, NULL);
-	}
+	io_cqring_ev_posted(ctx);
+	io_put_req(req, NULL);
 	return HRTIMER_NORESTART;
 }
 
@@ -2131,9 +2127,9 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	}
 	req->sequence -= span;
 	list_add(&req->list, entry);
-	spin_unlock_irq(&ctx->completion_lock);
 	req->timeout.timer.function = io_timeout_fn;
 	hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode);
+	spin_unlock_irq(&ctx->completion_lock);
 	return 0;
 }
 

From 975c99a570967dd48e917dd7853867fee3febabd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 30 Oct 2019 08:42:56 -0600
Subject: [PATCH 19/64] io_uring: io_wq_create() returns an error pointer, not
 NULL

syzbot reported an issue where we crash at setup time if failslab is
used. The issue is that io_wq_create() returns an error pointer on
failure, not NULL. Hence io_uring thought the io-wq was setup just
fine, but in reality it's a garbage error pointer.

Use IS_ERR() instead of a NULL check, and assign ret appropriately.

Reported-by: syzbot+221cc24572a2fed23b6b@syzkaller.appspotmail.com
Fixes: 561fb04a6a22 ("io_uring: replace workqueue usage with io-wq")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8e25c25c7309..72d260520c8f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3489,8 +3489,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 	/* Do QD, or 4 * CPUS, whatever is smallest */
 	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
 	ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm);
-	if (!ctx->io_wq) {
-		ret = -ENOMEM;
+	if (IS_ERR(ctx->io_wq)) {
+		ret = PTR_ERR(ctx->io_wq);
+		ctx->io_wq = NULL;
 		goto err;
 	}
 

From 62755e35dfb2b113c52b81cd96d01c20971c8e02 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 28 Oct 2019 21:49:21 -0600
Subject: [PATCH 20/64] io_uring: support for generic async request cancel

This adds support for IORING_OP_ASYNC_CANCEL, which will attempt to
cancel requests that have been punted to async context and are now
in-flight. This works for regular read/write requests to files, as
long as they haven't been started yet. For socket based IO (or things
like accept4(2)), we can cancel work that is already running as well.

To cancel a request, the sqe must have ->addr set to the user_data of
the request it wishes to cancel. If the request is cancelled
successfully, the original request is completed with -ECANCELED
and the cancel request is completed with a result of 0. If the
request was already running, the original may or may not complete
in error. The cancel request will complete with -EALREADY for that
case. And finally, if the request to cancel wasn't found, the cancel
request is completed with -ENOENT.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c                    | 85 +++++++++++++++++++++++++++++++++++
 fs/io-wq.h                    |  5 +++
 fs/io_uring.c                 | 45 +++++++++++++++++++
 include/uapi/linux/io_uring.h |  2 +
 4 files changed, 137 insertions(+)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 253c04a40db5..652b8bac2dbc 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -639,6 +639,91 @@ void io_wq_cancel_all(struct io_wq *wq)
 	rcu_read_unlock();
 }
 
+struct io_cb_cancel_data {
+	struct io_wqe *wqe;
+	work_cancel_fn *cancel;
+	void *caller_data;
+};
+
+static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
+{
+	struct io_cb_cancel_data *data = cancel_data;
+	struct io_wqe *wqe = data->wqe;
+	bool ret = false;
+
+	/*
+	 * Hold the lock to avoid ->cur_work going out of scope, caller
+	 * may deference the passed in work.
+	 */
+	spin_lock_irq(&wqe->lock);
+	if (worker->cur_work &&
+	    data->cancel(worker->cur_work, data->caller_data)) {
+		send_sig(SIGINT, worker->task, 1);
+		ret = true;
+	}
+	spin_unlock_irq(&wqe->lock);
+
+	return ret;
+}
+
+static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
+					       work_cancel_fn *cancel,
+					       void *cancel_data)
+{
+	struct io_cb_cancel_data data = {
+		.wqe = wqe,
+		.cancel = cancel,
+		.caller_data = cancel_data,
+	};
+	struct io_wq_work *work;
+	bool found = false;
+
+	spin_lock_irq(&wqe->lock);
+	list_for_each_entry(work, &wqe->work_list, list) {
+		if (cancel(work, cancel_data)) {
+			list_del(&work->list);
+			found = true;
+			break;
+		}
+	}
+	spin_unlock_irq(&wqe->lock);
+
+	if (found) {
+		work->flags |= IO_WQ_WORK_CANCEL;
+		work->func(&work);
+		return IO_WQ_CANCEL_OK;
+	}
+
+	rcu_read_lock();
+	found = io_wq_for_each_worker(wqe, &wqe->free_list, io_work_cancel,
+					&data);
+	if (found)
+		goto done;
+
+	found = io_wq_for_each_worker(wqe, &wqe->busy_list, io_work_cancel,
+					&data);
+done:
+	rcu_read_unlock();
+	return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
+}
+
+enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
+				  void *data)
+{
+	enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
+	int i;
+
+	for (i = 0; i < wq->nr_wqes; i++) {
+		struct io_wqe *wqe = wq->wqes[i];
+
+		ret = io_wqe_cancel_cb_work(wqe, cancel, data);
+		if (ret != IO_WQ_CANCEL_NOTFOUND)
+			break;
+	}
+
+	return ret;
+}
+
 static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 {
 	struct io_wq_work *work = data;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index e93f764b1fa4..3de192dc73fc 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -43,6 +43,11 @@ void io_wq_flush(struct io_wq *wq);
 void io_wq_cancel_all(struct io_wq *wq);
 enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
 
+typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
+
+enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
+					void *data);
+
 #if defined(CONFIG_IO_WQ)
 extern void io_wq_worker_sleeping(struct task_struct *);
 extern void io_wq_worker_running(struct task_struct *);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 72d260520c8f..76d653085987 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2133,6 +2133,48 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
+static bool io_cancel_cb(struct io_wq_work *work, void *data)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+	return req->user_data == (unsigned long) data;
+}
+
+static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			   struct io_kiocb **nxt)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	enum io_wq_cancel cancel_ret;
+	void *sqe_addr;
+	int ret = 0;
+
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
+	    sqe->cancel_flags)
+		return -EINVAL;
+
+	sqe_addr = (void *) (unsigned long) READ_ONCE(sqe->addr);
+	cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
+	switch (cancel_ret) {
+	case IO_WQ_CANCEL_OK:
+		ret = 0;
+		break;
+	case IO_WQ_CANCEL_RUNNING:
+		ret = -EALREADY;
+		break;
+	case IO_WQ_CANCEL_NOTFOUND:
+		ret = -ENOENT;
+		break;
+	}
+
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
+	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_put_req(req, nxt);
+	return 0;
+}
+
 static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			const struct io_uring_sqe *sqe)
 {
@@ -2217,6 +2259,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_ACCEPT:
 		ret = io_accept(req, s->sqe, nxt, force_nonblock);
 		break;
+	case IORING_OP_ASYNC_CANCEL:
+		ret = io_async_cancel(req, s->sqe, nxt);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f82d90e617a6..6877cf8894db 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -33,6 +33,7 @@ struct io_uring_sqe {
 		__u32		msg_flags;
 		__u32		timeout_flags;
 		__u32		accept_flags;
+		__u32		cancel_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -70,6 +71,7 @@ struct io_uring_sqe {
 #define IORING_OP_TIMEOUT	11
 #define IORING_OP_TIMEOUT_REMOVE	12
 #define IORING_OP_ACCEPT	13
+#define IORING_OP_ASYNC_CANCEL	14
 
 /*
  * sqe->fsync_flags

From e9ffa5c2b77edf2689f876b640318b16fc3ea2a7 Mon Sep 17 00:00:00 2001
From: Jackie Liu <liuyun01@kylinos.cn>
Date: Tue, 29 Oct 2019 11:16:42 +0800
Subject: [PATCH 21/64] io_uring: set -EINTR directly when a signal wakes up in
 io_cqring_wait

We didn't use -ERESTARTSYS to tell the application layer to restart the
system call, but instead return -EINTR. we can set -EINTR directly when
wakeup by the signal, which can help us save an assignment operation and
comparison operation.

Reviewed-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 76d653085987..a520c4262d85 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2976,7 +2976,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		.to_wait	= min_events,
 	};
 	struct io_rings *rings = ctx->rings;
-	int ret;
+	int ret = 0;
 
 	if (io_cqring_events(rings) >= min_events)
 		return 0;
@@ -2994,7 +2994,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
-	ret = 0;
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
@@ -3004,15 +3003,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			break;
 		schedule();
 		if (signal_pending(current)) {
-			ret = -ERESTARTSYS;
+			ret = -EINTR;
 			break;
 		}
 	} while (1);
 	finish_wait(&ctx->wait, &iowq.wq);
 
-	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
-	if (ret == -ERESTARTSYS)
-		ret = -EINTR;
+	restore_saved_sigmask_unless(ret == -EINTR);
 
 	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }

From 0069fc6b1cf28de3a3890ed7c87a5b8ab79ca528 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 1 Nov 2019 12:44:40 -0600
Subject: [PATCH 22/64] io_uring: remove io_uring_add_to_prev() trace event

This internal logic was killed with the conversion to io-wq, so we no
longer have a need for this particular trace. Kill it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/io_uring.h | 29 -----------------------------
 1 file changed, 29 deletions(-)

diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index b85255121b98..8f21d8bf20fd 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -227,35 +227,6 @@ TRACE_EVENT(io_uring_link,
 			  __entry->ctx, __entry->req, __entry->target_req)
 );
 
-/**
- * io_uring_add_to_prev - called after a request was added into a previously
- * 						  submitted work
- *
- * @req:	pointer to a request, added to a previous
- * @ret:	whether or not it was completed successfully
- *
- * Allows to track merged work, to figure out how often requests are piggy
- * backed into other ones, changing the execution flow.
- */
-TRACE_EVENT(io_uring_add_to_prev,
-
-	TP_PROTO(void *req, bool ret),
-
-	TP_ARGS(req, ret),
-
-	TP_STRUCT__entry (
-		__field(  void *,	req	)
-		__field(  bool,		ret	)
-	),
-
-	TP_fast_assign(
-		__entry->req	= req;
-		__entry->ret	= ret;
-	),
-
-	TP_printk("request %p, ret %d", __entry->req, __entry->ret)
-);
-
 /**
  * io_uring_cqring_wait - called before start waiting for an available CQE
  *

From 364b05fd06e87e53dc03396f73afeac48d8e0998 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Sat, 2 Nov 2019 15:55:01 +0800
Subject: [PATCH 23/64] io-wq: use kfree_rcu() to simplify the code

The callback function of call_rcu() just calls kfree(), so we can use
kfree_rcu() instead of call_rcu() + callback function.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 652b8bac2dbc..3bbab2c58695 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -102,13 +102,6 @@ struct io_wq {
 	struct completion done;
 };
 
-static void io_wq_free_worker(struct rcu_head *head)
-{
-	struct io_worker *worker = container_of(head, struct io_worker, rcu);
-
-	kfree(worker);
-}
-
 static bool io_worker_get(struct io_worker *worker)
 {
 	return refcount_inc_not_zero(&worker->ref);
@@ -194,7 +187,7 @@ static void io_worker_exit(struct io_worker *worker)
 	if (all_done && refcount_dec_and_test(&wqe->wq->refs))
 		complete(&wqe->wq->done);
 
-	call_rcu(&worker->rcu, io_wq_free_worker);
+	kfree_rcu(worker, rcu);
 }
 
 static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)

From 51c3ff62cac635ae9d75f875ce5b7bdafc97abd5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 3 Nov 2019 06:52:50 -0700
Subject: [PATCH 24/64] io_uring: add completion trace event

We currently don't have a completion event trace, add one of those. And
to better be able to match up submissions and completions, add user_data
to the submission trace as well.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                   |  6 ++--
 include/trace/events/io_uring.h | 54 +++++++++++++++++++++++++++------
 2 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a520c4262d85..7813bc7d5b61 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -592,6 +592,8 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 {
 	struct io_uring_cqe *cqe;
 
+	trace_io_uring_complete(ctx, ki_user_data, res);
+
 	/*
 	 * If we can't get a cq entry, userspace overflowed the
 	 * submission (by quite a lot). Increment the overflow count in
@@ -2733,7 +2735,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		s.has_user = *mm != NULL;
 		s.in_async = true;
 		s.needs_fixed_file = true;
-		trace_io_uring_submit_sqe(ctx, true, true);
+		trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, true);
 		io_submit_sqe(ctx, &s, statep, &link);
 		submitted++;
 	}
@@ -2913,7 +2915,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 		s.needs_fixed_file = false;
 		s.ring_fd = ring_fd;
 		submit++;
-		trace_io_uring_submit_sqe(ctx, true, false);
+		trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, false);
 		io_submit_sqe(ctx, &s, statep, &link);
 	}
 
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 8f21d8bf20fd..72a4d0174b02 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -284,10 +284,43 @@ TRACE_EVENT(io_uring_fail_link,
 	TP_printk("request %p, link %p", __entry->req, __entry->link)
 );
 
+/**
+ * io_uring_complete - called when completing an SQE
+ *
+ * @ctx:		pointer to a ring context structure
+ * @user_data:		user data associated with the request
+ * @res:		result of the request
+ *
+ */
+TRACE_EVENT(io_uring_complete,
+
+	TP_PROTO(void *ctx, u64 user_data, long res),
+
+	TP_ARGS(ctx, user_data, res),
+
+	TP_STRUCT__entry (
+		__field(  void *,	ctx		)
+		__field(  u64,		user_data	)
+		__field(  long,		res		)
+	),
+
+	TP_fast_assign(
+		__entry->ctx		= ctx;
+		__entry->user_data	= user_data;
+		__entry->res		= res;
+	),
+
+	TP_printk("ring %p, user_data 0x%llx, result %ld",
+			  __entry->ctx, (unsigned long long)__entry->user_data,
+			  __entry->res)
+);
+
+
 /**
  * io_uring_submit_sqe - called before submitting one SQE
  *
- * @ctx:			pointer to a ring context structure
+ * @ctx:		pointer to a ring context structure
+ * @user_data:		user data associated with the request
  * @force_nonblock:	whether a context blocking or not
  * @sq_thread:		true if sq_thread has submitted this SQE
  *
@@ -296,24 +329,27 @@ TRACE_EVENT(io_uring_fail_link,
  */
 TRACE_EVENT(io_uring_submit_sqe,
 
-	TP_PROTO(void *ctx, bool force_nonblock, bool sq_thread),
+	TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread),
 
-	TP_ARGS(ctx, force_nonblock, sq_thread),
+	TP_ARGS(ctx, user_data, force_nonblock, sq_thread),
 
 	TP_STRUCT__entry (
-		__field(  void *,	ctx				)
+		__field(  void *,	ctx		)
+		__field(  u64,		user_data	)
 		__field(  bool,		force_nonblock	)
-		__field(  bool,		sq_thread		)
+		__field(  bool,		sq_thread	)
 	),
 
 	TP_fast_assign(
-		__entry->ctx			= ctx;
+		__entry->ctx		= ctx;
+		__entry->user_data	= user_data;
 		__entry->force_nonblock	= force_nonblock;
-		__entry->sq_thread		= sq_thread;
+		__entry->sq_thread	= sq_thread;
 	),
 
-	TP_printk("ring %p, non block %d, sq_thread %d",
-			  __entry->ctx, __entry->force_nonblock, __entry->sq_thread)
+	TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d",
+			  __entry->ctx, (unsigned long long) __entry->user_data,
+			  __entry->force_nonblock, __entry->sq_thread)
 );
 
 #endif /* _TRACE_IO_URING_H */

From 1056ef940380c4e32349ccb6d956858edf70520c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 4 Nov 2019 08:50:02 -0700
Subject: [PATCH 25/64] MAINTAINERS: update io_uring entry

We now have a list that's appropriate for both kernel and userspace
discussions on io_uring usage and development, add that to the
MAINTAINERS entry.

Also add the io-wq files.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 MAINTAINERS | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index c6c34d04ce95..7afb25707098 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8562,12 +8562,13 @@ F:	include/linux/iova.h
 
 IO_URING
 M:	Jens Axboe <axboe@kernel.dk>
-L:	linux-block@vger.kernel.org
-L:	linux-fsdevel@vger.kernel.org
+L:	io-uring@vger.kernel.org
 T:	git git://git.kernel.dk/linux-block
 T:	git git://git.kernel.dk/liburing
 S:	Maintained
 F:	fs/io_uring.c
+F:	fs/io-wq.c
+F:	fs/io-wq.h
 F:	include/uapi/linux/io_uring.h
 
 IPMI SUBSYSTEM

From 6f72653e76a511db47addad6ab690390233fc250 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 5 Nov 2019 13:51:51 -0700
Subject: [PATCH 26/64] io-wq: use proper nesting IRQ disabling spinlocks for
 cancel

We don't know what context we'll be called in for cancel, it could very
well be with IRQs disabled already. Use the IRQ saving variants of the
locking primitives.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 3bbab2c58695..ba40a7ee31c3 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -642,19 +642,20 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
 {
 	struct io_cb_cancel_data *data = cancel_data;
 	struct io_wqe *wqe = data->wqe;
+	unsigned long flags;
 	bool ret = false;
 
 	/*
 	 * Hold the lock to avoid ->cur_work going out of scope, caller
 	 * may deference the passed in work.
 	 */
-	spin_lock_irq(&wqe->lock);
+	spin_lock_irqsave(&wqe->lock, flags);
 	if (worker->cur_work &&
 	    data->cancel(worker->cur_work, data->caller_data)) {
 		send_sig(SIGINT, worker->task, 1);
 		ret = true;
 	}
-	spin_unlock_irq(&wqe->lock);
+	spin_unlock_irqrestore(&wqe->lock, flags);
 
 	return ret;
 }
@@ -669,9 +670,10 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
 		.caller_data = cancel_data,
 	};
 	struct io_wq_work *work;
+	unsigned long flags;
 	bool found = false;
 
-	spin_lock_irq(&wqe->lock);
+	spin_lock_irqsave(&wqe->lock, flags);
 	list_for_each_entry(work, &wqe->work_list, list) {
 		if (cancel(work, cancel_data)) {
 			list_del(&work->list);
@@ -679,7 +681,7 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
 			break;
 		}
 	}
-	spin_unlock_irq(&wqe->lock);
+	spin_unlock_irqrestore(&wqe->lock, flags);
 
 	if (found) {
 		work->flags |= IO_WQ_WORK_CANCEL;
@@ -733,6 +735,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
 					    struct io_wq_work *cwork)
 {
 	struct io_wq_work *work;
+	unsigned long flags;
 	bool found = false;
 
 	cwork->flags |= IO_WQ_WORK_CANCEL;
@@ -742,7 +745,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
 	 * from there. CANCEL_OK means that the work is returned as-new,
 	 * no completion will be posted for it.
 	 */
-	spin_lock_irq(&wqe->lock);
+	spin_lock_irqsave(&wqe->lock, flags);
 	list_for_each_entry(work, &wqe->work_list, list) {
 		if (work == cwork) {
 			list_del(&work->list);
@@ -750,7 +753,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
 			break;
 		}
 	}
-	spin_unlock_irq(&wqe->lock);
+	spin_unlock_irqrestore(&wqe->lock, flags);
 
 	if (found) {
 		work->flags |= IO_WQ_WORK_CANCEL;

From 89723d0bd6c77540c01ce7db2cd6f8c3be2fd958 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 5 Nov 2019 15:32:58 -0700
Subject: [PATCH 27/64] io_uring: enable optimized link handling for
 IORING_OP_POLL_ADD

As introduced by commit:

ba816ad61fdf ("io_uring: run dependent links inline if possible")

enable inline dependent link running for poll commands.
io_poll_complete_work() is the most important change, as it allows a
linked sequence of { POLL, READ } (for example) to proceed inline
instead of needing to get punted to another async context. The
submission side only potentially matters for sqthread, but may as well
include that bit.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7813bc7d5b61..bda27b52fd5b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1806,6 +1806,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 	struct io_poll_iocb *poll = &req->poll;
 	struct poll_table_struct pt = { ._key = poll->events };
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *nxt = NULL;
 	__poll_t mask = 0;
 
 	if (work->flags & IO_WQ_WORK_CANCEL)
@@ -1832,7 +1833,10 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	io_cqring_ev_posted(ctx);
-	io_put_req(req, NULL);
+
+	io_put_req(req, &nxt);
+	if (nxt)
+		*workptr = &nxt->work;
 }
 
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -1886,7 +1890,8 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 	add_wait_queue(head, &pt->req->poll.wait);
 }
 
-static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		       struct io_kiocb **nxt)
 {
 	struct io_poll_iocb *poll = &req->poll;
 	struct io_ring_ctx *ctx = req->ctx;
@@ -1949,7 +1954,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	if (mask) {
 		io_cqring_ev_posted(ctx);
-		io_put_req(req, NULL);
+		io_put_req(req, nxt);
 	}
 	return ipt.error;
 }
@@ -2238,7 +2243,7 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		ret = io_fsync(req, s->sqe, nxt, force_nonblock);
 		break;
 	case IORING_OP_POLL_ADD:
-		ret = io_poll_add(req, s->sqe);
+		ret = io_poll_add(req, s->sqe, nxt);
 		break;
 	case IORING_OP_POLL_REMOVE:
 		ret = io_poll_remove(req, s->sqe);

From f1f40853c01b5ccd0a1a29ce0b515c6f5405a798 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 5 Nov 2019 20:33:16 -0700
Subject: [PATCH 28/64] io_uring: fixup a few spots where link failure isn't
 flagged

If a request fails, we need to ensure we set REQ_F_FAIL_LINK on it if
REQ_F_LINK is set. Any failure in the chain should break the chain.

We were missing a few spots where this should be done. It might be nice
to generalize this somewhat at some point, as long as we factor in the
fact that failure looks different for each request type.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bda27b52fd5b..4edc94aab17e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1672,6 +1672,8 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	}
 
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req(req, nxt);
 	return 0;
 }
@@ -1787,6 +1789,8 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req(req, NULL);
 	return 0;
 }
@@ -1994,6 +1998,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	io_cqring_ev_posted(ctx);
+	if (req->flags & REQ_F_LINK)
+		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req(req, NULL);
 	return HRTIMER_NORESTART;
 }
@@ -2035,6 +2041,8 @@ static int io_timeout_remove(struct io_kiocb *req,
 		io_commit_cqring(ctx);
 		spin_unlock_irq(&ctx->completion_lock);
 		io_cqring_ev_posted(ctx);
+		if (req->flags & REQ_F_LINK)
+			req->flags |= REQ_F_FAIL_LINK;
 		io_put_req(req, NULL);
 		return 0;
 	}
@@ -2328,6 +2336,8 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 	io_put_req(req, NULL);
 
 	if (ret) {
+		if (req->flags & REQ_F_LINK)
+			req->flags |= REQ_F_FAIL_LINK;
 		io_cqring_add_event(ctx, sqe->user_data, ret);
 		io_put_req(req, NULL);
 	}

From 3aa5fa030558e2b0da284fd069aeb7178543c987 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 5 Nov 2019 20:34:32 -0700
Subject: [PATCH 29/64] io_uring: kill dead REQ_F_LINK_DONE flag

We had no more use for this flag after the conversion to io-wq, kill it
off.

Fixes: 561fb04a6a22 ("io_uring: replace workqueue usage with io-wq")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4edc94aab17e..cfdb51dd669e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -329,7 +329,6 @@ struct io_kiocb {
 #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
 #define REQ_F_IO_DRAINED	32	/* drain done */
 #define REQ_F_LINK		64	/* linked sqes */
-#define REQ_F_LINK_DONE		128	/* linked sqes done */
 #define REQ_F_FAIL_LINK		256	/* fail rest of links */
 #define REQ_F_SHADOW_DRAIN	512	/* link-drain shadow req */
 #define REQ_F_TIMEOUT		1024	/* timeout request */
@@ -731,7 +730,6 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 			nxt->flags |= REQ_F_LINK;
 		}
 
-		nxt->flags |= REQ_F_LINK_DONE;
 		/*
 		 * If we're in async work, we can continue processing the chain
 		 * in this context instead of having to queue up new async work.

From ae9428ca61271b6b7f52ebbc359676c9fdfde523 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 6 Nov 2019 00:22:14 +0300
Subject: [PATCH 30/64] io_uring: Merge io_submit_sqes and io_ring_submit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

io_submit_sqes() and io_ring_submit() are doing the same stuff with
a little difference. Deduplicate them.

Reviewed-by：Bob Liu <bob.liu@oracle.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 91 ++++++++++-----------------------------------------
 1 file changed, 18 insertions(+), 73 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index cfdb51dd669e..375c09a43d32 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2694,7 +2694,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 }
 
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
-			  struct mm_struct **mm)
+			  struct file *ring_file, int ring_fd,
+			  struct mm_struct **mm, bool async)
 {
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
@@ -2745,10 +2746,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		}
 
 out:
+		s.ring_file = ring_file;
+		s.ring_fd = ring_fd;
 		s.has_user = *mm != NULL;
-		s.in_async = true;
-		s.needs_fixed_file = true;
-		trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, true);
+		s.in_async = async;
+		s.needs_fixed_file = async;
+		trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, async);
 		io_submit_sqe(ctx, &s, statep, &link);
 		submitted++;
 	}
@@ -2758,6 +2761,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	if (statep)
 		io_submit_state_end(&state);
 
+	 /* Commit SQ ring head once we've consumed and submitted all SQEs */
+	io_commit_sqring(ctx);
+
 	return submitted;
 }
 
@@ -2862,10 +2868,8 @@ static int io_sq_thread(void *data)
 		}
 
 		to_submit = min(to_submit, ctx->sq_entries);
-		inflight += io_submit_sqes(ctx, to_submit, &cur_mm);
-
-		/* Commit SQ ring head once we've consumed all SQEs */
-		io_commit_sqring(ctx);
+		inflight += io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm,
+					   true);
 	}
 
 	set_fs(old_fs);
@@ -2879,69 +2883,6 @@ static int io_sq_thread(void *data)
 	return 0;
 }
 
-static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
-			  struct file *ring_file, int ring_fd)
-{
-	struct io_submit_state state, *statep = NULL;
-	struct io_kiocb *link = NULL;
-	struct io_kiocb *shadow_req = NULL;
-	bool prev_was_link = false;
-	int i, submit = 0;
-
-	if (to_submit > IO_PLUG_THRESHOLD) {
-		io_submit_state_start(&state, ctx, to_submit);
-		statep = &state;
-	}
-
-	for (i = 0; i < to_submit; i++) {
-		struct sqe_submit s;
-
-		if (!io_get_sqring(ctx, &s))
-			break;
-
-		/*
-		 * If previous wasn't linked and we have a linked command,
-		 * that's the end of the chain. Submit the previous link.
-		 */
-		if (!prev_was_link && link) {
-			io_queue_link_head(ctx, link, &link->submit, shadow_req);
-			link = NULL;
-			shadow_req = NULL;
-		}
-		prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
-
-		if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
-			if (!shadow_req) {
-				shadow_req = io_get_req(ctx, NULL);
-				if (unlikely(!shadow_req))
-					goto out;
-				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
-				refcount_dec(&shadow_req->refs);
-			}
-			shadow_req->sequence = s.sequence;
-		}
-
-out:
-		s.ring_file = ring_file;
-		s.has_user = true;
-		s.in_async = false;
-		s.needs_fixed_file = false;
-		s.ring_fd = ring_fd;
-		submit++;
-		trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, false);
-		io_submit_sqe(ctx, &s, statep, &link);
-	}
-
-	if (link)
-		io_queue_link_head(ctx, link, &link->submit, shadow_req);
-	if (statep)
-		io_submit_state_end(statep);
-
-	io_commit_sqring(ctx);
-
-	return submit;
-}
-
 struct io_wait_queue {
 	struct wait_queue_entry wq;
 	struct io_ring_ctx *ctx;
@@ -4062,10 +4003,14 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			wake_up(&ctx->sqo_wait);
 		submitted = to_submit;
 	} else if (to_submit) {
-		to_submit = min(to_submit, ctx->sq_entries);
+		struct mm_struct *cur_mm;
 
+		to_submit = min(to_submit, ctx->sq_entries);
 		mutex_lock(&ctx->uring_lock);
-		submitted = io_ring_submit(ctx, to_submit, f.file, fd);
+		/* already have mm, so io_submit_sqes() won't try to grab it */
+		cur_mm = ctx->sqo_mm;
+		submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
+					   &cur_mm, false);
 		mutex_unlock(&ctx->uring_lock);
 	}
 	if (flags & IORING_ENTER_GETEVENTS) {

From e5eb6366ac2d1df8ad5b010718ac1997ceae45be Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 6 Nov 2019 00:22:15 +0300
Subject: [PATCH 31/64] io_uring: io_queue_link*() right after submit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After a call to io_submit_sqe(), it's already known whether it needs
to queue a link or not. Do it there, as it's simplier and doesn't keep
an extra variable across the loop.

Reviewed-by：Bob Liu <bob.liu@oracle.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 375c09a43d32..6524898831e0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2700,7 +2700,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
 	struct io_kiocb *shadow_req = NULL;
-	bool prev_was_link = false;
 	int i, submitted = 0;
 	bool mm_fault = false;
 
@@ -2723,17 +2722,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			}
 		}
 
-		/*
-		 * If previous wasn't linked and we have a linked command,
-		 * that's the end of the chain. Submit the previous link.
-		 */
-		if (!prev_was_link && link) {
-			io_queue_link_head(ctx, link, &link->submit, shadow_req);
-			link = NULL;
-			shadow_req = NULL;
-		}
-		prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
-
 		if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
 			if (!shadow_req) {
 				shadow_req = io_get_req(ctx, NULL);
@@ -2754,6 +2742,16 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, async);
 		io_submit_sqe(ctx, &s, statep, &link);
 		submitted++;
+
+		/*
+		 * If previous wasn't linked and we have a linked command,
+		 * that's the end of the chain. Submit the previous link.
+		 */
+		if (!(s.sqe->flags & IOSQE_IO_LINK) && link) {
+			io_queue_link_head(ctx, link, &link->submit, shadow_req);
+			link = NULL;
+			shadow_req = NULL;
+		}
 	}
 
 	if (link)

From 196be95cd5572078be9deb81cbea145fab246029 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 7 Nov 2019 01:41:06 +0300
Subject: [PATCH 32/64] io_uring: allocate io_kiocb upfront

Let io_submit_sqes() to allocate io_kiocb before fetching an sqe.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6524898831e0..d10216dd02bc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2551,30 +2551,23 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
 
-static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
-			  struct io_submit_state *state, struct io_kiocb **link)
+static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			  struct sqe_submit *s, struct io_submit_state *state,
+			  struct io_kiocb **link)
 {
 	struct io_uring_sqe *sqe_copy;
-	struct io_kiocb *req;
 	int ret;
 
 	/* enforce forwards compatibility on users */
 	if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
 		ret = -EINVAL;
-		goto err;
-	}
-
-	req = io_get_req(ctx, state);
-	if (unlikely(!req)) {
-		ret = -EAGAIN;
-		goto err;
+		goto err_req;
 	}
 
 	ret = io_req_set_file(ctx, s, state, req);
 	if (unlikely(ret)) {
 err_req:
 		io_free_req(req, NULL);
-err:
 		io_cqring_add_event(ctx, s->sqe->user_data, ret);
 		return;
 	}
@@ -2710,9 +2703,18 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 
 	for (i = 0; i < nr; i++) {
 		struct sqe_submit s;
+		struct io_kiocb *req;
 
-		if (!io_get_sqring(ctx, &s))
+		req = io_get_req(ctx, statep);
+		if (unlikely(!req)) {
+			if (!submitted)
+				submitted = -EAGAIN;
 			break;
+		}
+		if (!io_get_sqring(ctx, &s)) {
+			__io_free_req(req);
+			break;
+		}
 
 		if (io_sqe_needs_user(s.sqe) && !*mm) {
 			mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
@@ -2740,7 +2742,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		s.in_async = async;
 		s.needs_fixed_file = async;
 		trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, async);
-		io_submit_sqe(ctx, &s, statep, &link);
+		io_submit_sqe(ctx, req, &s, statep, &link);
 		submitted++;
 
 		/*

From 50585b9a07367b92382c1e975265344daeba78cd Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 7 Nov 2019 01:41:07 +0300
Subject: [PATCH 33/64] io_uring: Use submit info inlined into req

Stack allocated struct sqe_submit is passed down to the submission path
along with a request (a.k.a. struct io_kiocb), and will be copied into
req->submit for async requests.

As space for it is already allocated, fill req->submit in the first
place instead of using on-stack one. As a result:

1. sqe->submit is the only place for sqe_submit and is always valid,
so we don't need to track which one to use.
2. don't need to copy in case of async
3. allows to simplify the code by not carrying it as an argument all
the way down
4. allows to reduce number of function arguments / potentially improve
spilling

The downside is that stack is most probably be cached, that's not true
for just allocated memory for a request. Another concern is cache
pollution. Though, a request would be touched and fetched along with
req->submit at some point anyway, so shouldn't be a problem.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d10216dd02bc..2b48a79848f2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2456,7 +2456,6 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
 		if (sqe_copy) {
 			s->sqe = sqe_copy;
-			memcpy(&req->submit, s, sizeof(*s));
 			if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
 				ret = io_grab_files(ctx, req);
 				if (ret) {
@@ -2591,13 +2590,11 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		}
 
 		s->sqe = sqe_copy;
-		memcpy(&req->submit, s, sizeof(*s));
 		trace_io_uring_link(ctx, req, prev);
 		list_add_tail(&req->list, &prev->link_list);
 	} else if (s->sqe->flags & IOSQE_IO_LINK) {
 		req->flags |= REQ_F_LINK;
 
-		memcpy(&req->submit, s, sizeof(*s));
 		INIT_LIST_HEAD(&req->link_list);
 		*link = req;
 	} else {
@@ -2702,8 +2699,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	}
 
 	for (i = 0; i < nr; i++) {
-		struct sqe_submit s;
 		struct io_kiocb *req;
+		unsigned int sqe_flags;
 
 		req = io_get_req(ctx, statep);
 		if (unlikely(!req)) {
@@ -2711,12 +2708,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 				submitted = -EAGAIN;
 			break;
 		}
-		if (!io_get_sqring(ctx, &s)) {
+		if (!io_get_sqring(ctx, &req->submit)) {
 			__io_free_req(req);
 			break;
 		}
 
-		if (io_sqe_needs_user(s.sqe) && !*mm) {
+		if (io_sqe_needs_user(req->submit.sqe) && !*mm) {
 			mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
 			if (!mm_fault) {
 				use_mm(ctx->sqo_mm);
@@ -2724,7 +2721,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			}
 		}
 
-		if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
+		sqe_flags = req->submit.sqe->flags;
+
+		if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
 			if (!shadow_req) {
 				shadow_req = io_get_req(ctx, NULL);
 				if (unlikely(!shadow_req))
@@ -2732,24 +2731,25 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
 				refcount_dec(&shadow_req->refs);
 			}
-			shadow_req->sequence = s.sequence;
+			shadow_req->sequence = req->submit.sequence;
 		}
 
 out:
-		s.ring_file = ring_file;
-		s.ring_fd = ring_fd;
-		s.has_user = *mm != NULL;
-		s.in_async = async;
-		s.needs_fixed_file = async;
-		trace_io_uring_submit_sqe(ctx, s.sqe->user_data, true, async);
-		io_submit_sqe(ctx, req, &s, statep, &link);
+		req->submit.ring_file = ring_file;
+		req->submit.ring_fd = ring_fd;
+		req->submit.has_user = *mm != NULL;
+		req->submit.in_async = async;
+		req->submit.needs_fixed_file = async;
+		trace_io_uring_submit_sqe(ctx, req->submit.sqe->user_data,
+					  true, async);
+		io_submit_sqe(ctx, req, &req->submit, statep, &link);
 		submitted++;
 
 		/*
 		 * If previous wasn't linked and we have a linked command,
 		 * that's the end of the chain. Submit the previous link.
 		 */
-		if (!(s.sqe->flags & IOSQE_IO_LINK) && link) {
+		if (!(sqe_flags & IOSQE_IO_LINK) && link) {
 			io_queue_link_head(ctx, link, &link->submit, shadow_req);
 			link = NULL;
 			shadow_req = NULL;

From 267bc90442aa47002e2991f7d9dd141e168b466b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 7 Nov 2019 01:41:08 +0300
Subject: [PATCH 34/64] io_uring: use inlined struct sqe_submit

req->submit is always up-to-date, use it directly

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 87 +++++++++++++++++++++++++--------------------------
 1 file changed, 43 insertions(+), 44 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2b48a79848f2..ee33c7020828 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1155,10 +1155,9 @@ static bool io_file_supports_async(struct file *file)
 	return false;
 }
 
-static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
-		      bool force_nonblock)
+static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
 {
-	const struct io_uring_sqe *sqe = s->sqe;
+	const struct io_uring_sqe *sqe = req->submit.sqe;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
 	unsigned ioprio;
@@ -1406,8 +1405,8 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
 	return ret;
 }
 
-static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
-		   struct io_kiocb **nxt, bool force_nonblock)
+static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
+		   bool force_nonblock)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
@@ -1416,7 +1415,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 	size_t iov_count;
 	ssize_t read_size, ret;
 
-	ret = io_prep_rw(req, s, force_nonblock);
+	ret = io_prep_rw(req, force_nonblock);
 	if (ret)
 		return ret;
 	file = kiocb->ki_filp;
@@ -1424,7 +1423,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 	if (unlikely(!(file->f_mode & FMODE_READ)))
 		return -EBADF;
 
-	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
+	ret = io_import_iovec(req->ctx, READ, &req->submit, &iovec, &iter);
 	if (ret < 0)
 		return ret;
 
@@ -1456,7 +1455,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 			ret2 = -EAGAIN;
 		/* Catch -EAGAIN return for forced non-blocking submission */
 		if (!force_nonblock || ret2 != -EAGAIN)
-			kiocb_done(kiocb, ret2, nxt, s->in_async);
+			kiocb_done(kiocb, ret2, nxt, req->submit.in_async);
 		else
 			ret = -EAGAIN;
 	}
@@ -1464,8 +1463,8 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 	return ret;
 }
 
-static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
-		    struct io_kiocb **nxt, bool force_nonblock)
+static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
+		    bool force_nonblock)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
@@ -1474,7 +1473,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	size_t iov_count;
 	ssize_t ret;
 
-	ret = io_prep_rw(req, s, force_nonblock);
+	ret = io_prep_rw(req, force_nonblock);
 	if (ret)
 		return ret;
 
@@ -1482,7 +1481,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	if (unlikely(!(file->f_mode & FMODE_WRITE)))
 		return -EBADF;
 
-	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
+	ret = io_import_iovec(req->ctx, WRITE, &req->submit, &iovec, &iter);
 	if (ret < 0)
 		return ret;
 
@@ -1519,7 +1518,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 		else
 			ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
 		if (!force_nonblock || ret2 != -EAGAIN)
-			kiocb_done(kiocb, ret2, nxt, s->in_async);
+			kiocb_done(kiocb, ret2, nxt, req->submit.in_async);
 		else
 			ret = -EAGAIN;
 	}
@@ -2188,9 +2187,9 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
-static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			const struct io_uring_sqe *sqe)
+static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req)
 {
+	const struct io_uring_sqe *sqe = req->submit.sqe;
 	struct io_uring_sqe *sqe_copy;
 
 	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
@@ -2217,10 +2216,10 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
 }
 
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			   const struct sqe_submit *s, struct io_kiocb **nxt,
-			   bool force_nonblock)
+			   struct io_kiocb **nxt, bool force_nonblock)
 {
 	int ret, opcode;
+	struct sqe_submit *s = &req->submit;
 
 	req->user_data = READ_ONCE(s->sqe->user_data);
 
@@ -2232,18 +2231,18 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_READV:
 		if (unlikely(s->sqe->buf_index))
 			return -EINVAL;
-		ret = io_read(req, s, nxt, force_nonblock);
+		ret = io_read(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_WRITEV:
 		if (unlikely(s->sqe->buf_index))
 			return -EINVAL;
-		ret = io_write(req, s, nxt, force_nonblock);
+		ret = io_write(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_READ_FIXED:
-		ret = io_read(req, s, nxt, force_nonblock);
+		ret = io_read(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_WRITE_FIXED:
-		ret = io_write(req, s, nxt, force_nonblock);
+		ret = io_write(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_FSYNC:
 		ret = io_fsync(req, s->sqe, nxt, force_nonblock);
@@ -2318,7 +2317,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 		s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
 		s->in_async = true;
 		do {
-			ret = __io_submit_sqe(ctx, req, s, &nxt, false);
+			ret = __io_submit_sqe(ctx, req, &nxt, false);
 			/*
 			 * We can get EAGAIN for polled IO even though we're
 			 * forcing a sync submission from here, since we can't
@@ -2372,9 +2371,10 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
 	return table->files[index & IORING_FILE_TABLE_MASK];
 }
 
-static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
+static int io_req_set_file(struct io_ring_ctx *ctx,
 			   struct io_submit_state *state, struct io_kiocb *req)
 {
+	struct sqe_submit *s = &req->submit;
 	unsigned flags;
 	int fd;
 
@@ -2438,12 +2438,11 @@ static int io_grab_files(struct io_ring_ctx *ctx, struct io_kiocb *req)
 	return ret;
 }
 
-static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			struct sqe_submit *s)
+static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req)
 {
 	int ret;
 
-	ret = __io_submit_sqe(ctx, req, s, NULL, true);
+	ret = __io_submit_sqe(ctx, req, NULL, true);
 
 	/*
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -2451,6 +2450,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 */
 	if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
 	    (req->flags & REQ_F_MUST_PUNT))) {
+		struct sqe_submit *s = &req->submit;
 		struct io_uring_sqe *sqe_copy;
 
 		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
@@ -2488,31 +2488,30 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	return ret;
 }
 
-static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			struct sqe_submit *s)
+static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req)
 {
 	int ret;
 
-	ret = io_req_defer(ctx, req, s->sqe);
+	ret = io_req_defer(ctx, req);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
+			io_cqring_add_event(ctx, req->submit.sqe->user_data, ret);
 			io_free_req(req, NULL);
-			io_cqring_add_event(ctx, s->sqe->user_data, ret);
 		}
 		return 0;
 	}
 
-	return __io_queue_sqe(ctx, req, s);
+	return __io_queue_sqe(ctx, req);
 }
 
 static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			      struct sqe_submit *s, struct io_kiocb *shadow)
+			      struct io_kiocb *shadow)
 {
 	int ret;
 	int need_submit = false;
 
 	if (!shadow)
-		return io_queue_sqe(ctx, req, s);
+		return io_queue_sqe(ctx, req);
 
 	/*
 	 * Mark the first IO in link list as DRAIN, let all the following
@@ -2520,12 +2519,12 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * list.
 	 */
 	req->flags |= REQ_F_IO_DRAIN;
-	ret = io_req_defer(ctx, req, s->sqe);
+	ret = io_req_defer(ctx, req);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
+			io_cqring_add_event(ctx, req->submit.sqe->user_data, ret);
 			io_free_req(req, NULL);
 			__io_free_req(shadow);
-			io_cqring_add_event(ctx, s->sqe->user_data, ret);
 			return 0;
 		}
 	} else {
@@ -2543,7 +2542,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	spin_unlock_irq(&ctx->completion_lock);
 
 	if (need_submit)
-		return __io_queue_sqe(ctx, req, s);
+		return __io_queue_sqe(ctx, req);
 
 	return 0;
 }
@@ -2551,10 +2550,10 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
 #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
 
 static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			  struct sqe_submit *s, struct io_submit_state *state,
-			  struct io_kiocb **link)
+			  struct io_submit_state *state, struct io_kiocb **link)
 {
 	struct io_uring_sqe *sqe_copy;
+	struct sqe_submit *s = &req->submit;
 	int ret;
 
 	/* enforce forwards compatibility on users */
@@ -2563,11 +2562,11 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		goto err_req;
 	}
 
-	ret = io_req_set_file(ctx, s, state, req);
+	ret = io_req_set_file(ctx, state, req);
 	if (unlikely(ret)) {
 err_req:
-		io_free_req(req, NULL);
 		io_cqring_add_event(ctx, s->sqe->user_data, ret);
+		io_free_req(req, NULL);
 		return;
 	}
 
@@ -2598,7 +2597,7 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		INIT_LIST_HEAD(&req->link_list);
 		*link = req;
 	} else {
-		io_queue_sqe(ctx, req, s);
+		io_queue_sqe(ctx, req);
 	}
 }
 
@@ -2742,7 +2741,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		req->submit.needs_fixed_file = async;
 		trace_io_uring_submit_sqe(ctx, req->submit.sqe->user_data,
 					  true, async);
-		io_submit_sqe(ctx, req, &req->submit, statep, &link);
+		io_submit_sqe(ctx, req, statep, &link);
 		submitted++;
 
 		/*
@@ -2750,14 +2749,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		 * that's the end of the chain. Submit the previous link.
 		 */
 		if (!(sqe_flags & IOSQE_IO_LINK) && link) {
-			io_queue_link_head(ctx, link, &link->submit, shadow_req);
+			io_queue_link_head(ctx, link, shadow_req);
 			link = NULL;
 			shadow_req = NULL;
 		}
 	}
 
 	if (link)
-		io_queue_link_head(ctx, link, &link->submit, shadow_req);
+		io_queue_link_head(ctx, link, shadow_req);
 	if (statep)
 		io_submit_state_end(&state);
 

From e977d6d34f0c08e3c3b132c9e73b98d0db50abc1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 5 Nov 2019 12:39:45 -0700
Subject: [PATCH 35/64] io_uring: abstract out io_async_cancel_one() helper

We're going to need this helper in a future patch, so move it out
of io_async_cancel() and into its own separate function.

No functional changes in this patch.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ee33c7020828..5360c3dd262b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2152,21 +2152,11 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data)
 	return req->user_data == (unsigned long) data;
 }
 
-static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-			   struct io_kiocb **nxt)
+static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
 {
-	struct io_ring_ctx *ctx = req->ctx;
 	enum io_wq_cancel cancel_ret;
-	void *sqe_addr;
 	int ret = 0;
 
-	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
-	if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
-	    sqe->cancel_flags)
-		return -EINVAL;
-
-	sqe_addr = (void *) (unsigned long) READ_ONCE(sqe->addr);
 	cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
 	switch (cancel_ret) {
 	case IO_WQ_CANCEL_OK:
@@ -2180,6 +2170,25 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		break;
 	}
 
+	return ret;
+}
+
+static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			   struct io_kiocb **nxt)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	void *sqe_addr;
+	int ret;
+
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
+	    sqe->cancel_flags)
+		return -EINVAL;
+
+	sqe_addr = (void *) (unsigned long) READ_ONCE(sqe->addr);
+	ret = io_async_cancel_one(ctx, sqe_addr);
+
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);

From 2665abfd757fb35a241c6f0b1ebf620e3ffb36fb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 5 Nov 2019 12:40:47 -0700
Subject: [PATCH 36/64] io_uring: add support for linked SQE timeouts

While we have support for generic timeouts, we don't have a way to tie
a timeout to a specific SQE. The generic timeouts simply trigger wakeups
on the CQ ring.

This adds support for IORING_OP_LINK_TIMEOUT. This command is only valid
as a link to a previous command. The timeout specific can be either
relative or absolute, following the same rules as IORING_OP_TIMEOUT. If
the timeout triggers before the dependent command completes, it will
attempt to cancel that command. Likewise, if the dependent command
completes before the timeout triggers, it will cancel the timeout.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 191 ++++++++++++++++++++++++++++++++--
 include/uapi/linux/io_uring.h |   1 +
 2 files changed, 181 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5360c3dd262b..eadd19ab39a8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -329,6 +329,7 @@ struct io_kiocb {
 #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
 #define REQ_F_IO_DRAINED	32	/* drain done */
 #define REQ_F_LINK		64	/* linked sqes */
+#define REQ_F_LINK_TIMEOUT	128	/* has linked timeout */
 #define REQ_F_FAIL_LINK		256	/* fail rest of links */
 #define REQ_F_SHADOW_DRAIN	512	/* link-drain shadow req */
 #define REQ_F_TIMEOUT		1024	/* timeout request */
@@ -371,6 +372,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr);
 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 				 long res);
 static void __io_free_req(struct io_kiocb *req);
+static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr);
 
 static struct kmem_cache *req_cachep;
 
@@ -712,9 +714,28 @@ static void __io_free_req(struct io_kiocb *req)
 	kmem_cache_free(req_cachep, req);
 }
 
+static bool io_link_cancel_timeout(struct io_ring_ctx *ctx,
+				   struct io_kiocb *req)
+{
+	int ret;
+
+	ret = hrtimer_try_to_cancel(&req->timeout.timer);
+	if (ret != -1) {
+		io_cqring_fill_event(ctx, req->user_data, -ECANCELED);
+		io_commit_cqring(ctx);
+		req->flags &= ~REQ_F_LINK;
+		__io_free_req(req);
+		return true;
+	}
+
+	return false;
+}
+
 static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 {
+	struct io_ring_ctx *ctx = req->ctx;
 	struct io_kiocb *nxt;
+	bool wake_ev = false;
 
 	/*
 	 * The list should never be empty when we are called here. But could
@@ -722,7 +743,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 	 * safe side.
 	 */
 	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
-	if (nxt) {
+	while (nxt) {
 		list_del(&nxt->list);
 		if (!list_empty(&req->link_list)) {
 			INIT_LIST_HEAD(&nxt->link_list);
@@ -734,11 +755,23 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 		 * If we're in async work, we can continue processing the chain
 		 * in this context instead of having to queue up new async work.
 		 */
-		if (nxtptr && current_work())
+		if (req->flags & REQ_F_LINK_TIMEOUT) {
+			wake_ev = io_link_cancel_timeout(ctx, nxt);
+
+			/* we dropped this link, get next */
+			nxt = list_first_entry_or_null(&req->link_list,
+							struct io_kiocb, list);
+		} else if (nxtptr && current_work()) {
 			*nxtptr = nxt;
-		else
+			break;
+		} else {
 			io_queue_async_work(req->ctx, nxt);
+			break;
+		}
 	}
+
+	if (wake_ev)
+		io_cqring_ev_posted(ctx);
 }
 
 /*
@@ -746,31 +779,61 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
  */
 static void io_fail_links(struct io_kiocb *req)
 {
+	struct io_ring_ctx *ctx = req->ctx;
 	struct io_kiocb *link;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
 
 	while (!list_empty(&req->link_list)) {
 		link = list_first_entry(&req->link_list, struct io_kiocb, list);
-		list_del(&link->list);
+		list_del_init(&link->list);
 
 		trace_io_uring_fail_link(req, link);
-		io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
-		__io_free_req(link);
+
+		if ((req->flags & REQ_F_LINK_TIMEOUT) &&
+		    link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) {
+			io_link_cancel_timeout(ctx, link);
+		} else {
+			io_cqring_fill_event(ctx, link->user_data, -ECANCELED);
+			__io_free_req(link);
+		}
 	}
+
+	io_commit_cqring(ctx);
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	io_cqring_ev_posted(ctx);
 }
 
 static void io_free_req(struct io_kiocb *req, struct io_kiocb **nxt)
 {
+	if (likely(!(req->flags & REQ_F_LINK))) {
+		__io_free_req(req);
+		return;
+	}
+
 	/*
 	 * If LINK is set, we have dependent requests in this chain. If we
 	 * didn't fail this request, queue the first one up, moving any other
 	 * dependencies to the next request. In case of failure, fail the rest
 	 * of the chain.
 	 */
-	if (req->flags & REQ_F_LINK) {
-		if (req->flags & REQ_F_FAIL_LINK)
-			io_fail_links(req);
-		else
-			io_req_link_next(req, nxt);
+	if (req->flags & REQ_F_FAIL_LINK) {
+		io_fail_links(req);
+	} else if (req->flags & REQ_F_LINK_TIMEOUT) {
+		struct io_ring_ctx *ctx = req->ctx;
+		unsigned long flags;
+
+		/*
+		 * If this is a timeout link, we could be racing with the
+		 * timeout timer. Grab the completion lock for this case to
+		 * protection against that.
+		 */
+		spin_lock_irqsave(&ctx->completion_lock, flags);
+		io_req_link_next(req, nxt);
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	} else {
+		io_req_link_next(req, nxt);
 	}
 
 	__io_free_req(req);
@@ -2447,10 +2510,112 @@ static int io_grab_files(struct io_ring_ctx *ctx, struct io_kiocb *req)
 	return ret;
 }
 
+static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
+{
+	struct io_kiocb *req = container_of(timer, struct io_kiocb,
+						timeout.timer);
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *prev = NULL;
+	unsigned long flags;
+	int ret = -ETIME;
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+
+	/*
+	 * We don't expect the list to be empty, that will only happen if we
+	 * race with the completion of the linked work.
+	 */
+	if (!list_empty(&req->list)) {
+		prev = list_entry(req->list.prev, struct io_kiocb, link_list);
+		list_del_init(&req->list);
+	}
+
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	if (prev) {
+		void *user_data = (void *) (unsigned long) prev->user_data;
+		ret = io_async_cancel_one(ctx, user_data);
+	}
+
+	io_cqring_add_event(ctx, req->user_data, ret);
+	io_put_req(req, NULL);
+	return HRTIMER_NORESTART;
+}
+
+static int io_queue_linked_timeout(struct io_kiocb *req, struct io_kiocb *nxt)
+{
+	const struct io_uring_sqe *sqe = nxt->submit.sqe;
+	enum hrtimer_mode mode;
+	struct timespec64 ts;
+	int ret = -EINVAL;
+
+	if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off)
+		goto err;
+	if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS)
+		goto err;
+	if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	req->flags |= REQ_F_LINK_TIMEOUT;
+
+	if (sqe->timeout_flags & IORING_TIMEOUT_ABS)
+		mode = HRTIMER_MODE_ABS;
+	else
+		mode = HRTIMER_MODE_REL;
+	hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, mode);
+	nxt->timeout.timer.function = io_link_timeout_fn;
+	hrtimer_start(&nxt->timeout.timer, timespec64_to_ktime(ts), mode);
+	ret = 0;
+err:
+	/* drop submission reference */
+	io_put_req(nxt, NULL);
+
+	if (ret) {
+		struct io_ring_ctx *ctx = req->ctx;
+
+		/*
+		 * Break the link and fail linked timeout, parent will get
+		 * failed by the regular submission path.
+		 */
+		list_del(&nxt->list);
+		io_cqring_fill_event(ctx, nxt->user_data, ret);
+		trace_io_uring_fail_link(req, nxt);
+		io_commit_cqring(ctx);
+		io_put_req(nxt, NULL);
+		ret = -ECANCELED;
+	}
+
+	return ret;
+}
+
+static inline struct io_kiocb *io_get_linked_timeout(struct io_kiocb *req)
+{
+	struct io_kiocb *nxt;
+
+	if (!(req->flags & REQ_F_LINK))
+		return NULL;
+
+	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
+	if (nxt && nxt->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT)
+		return nxt;
+
+	return NULL;
+}
+
 static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req)
 {
+	struct io_kiocb *nxt;
 	int ret;
 
+	nxt = io_get_linked_timeout(req);
+	if (unlikely(nxt)) {
+		ret = io_queue_linked_timeout(req, nxt);
+		if (ret)
+			goto err;
+	}
+
 	ret = __io_submit_sqe(ctx, req, NULL, true);
 
 	/*
@@ -2605,6 +2770,10 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 		INIT_LIST_HEAD(&req->link_list);
 		*link = req;
+	} else if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
+		/* Only valid as a linked SQE */
+		ret = -EINVAL;
+		goto err_req;
 	} else {
 		io_queue_sqe(ctx, req);
 	}
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6877cf8894db..f1a118b01d18 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -72,6 +72,7 @@ struct io_uring_sqe {
 #define IORING_OP_TIMEOUT_REMOVE	12
 #define IORING_OP_ACCEPT	13
 #define IORING_OP_ASYNC_CANCEL	14
+#define IORING_OP_LINK_TIMEOUT	15
 
 /*
  * sqe->fsync_flags

From 84f97dc2333c626979bb547fce343a1003544dcc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 6 Nov 2019 11:27:53 -0700
Subject: [PATCH 37/64] io_uring: make io_cqring_events() take 'ctx' as
 argument

The rings can be derived from the ctx, and we need the ctx there for
a future change.

No functional changes in this patch.

Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index eadd19ab39a8..d8e15cce936e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -866,8 +866,10 @@ static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr)
 	}
 }
 
-static unsigned io_cqring_events(struct io_rings *rings)
+static unsigned io_cqring_events(struct io_ring_ctx *ctx)
 {
+	struct io_rings *rings = ctx->rings;
+
 	/* See comment at the top of this file */
 	smp_rmb();
 	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
@@ -1023,7 +1025,7 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 		 * If we do, we can potentially be spinning for commands that
 		 * already triggered a CQE (eg in error).
 		 */
-		if (io_cqring_events(ctx->rings))
+		if (io_cqring_events(ctx))
 			break;
 
 		/*
@@ -3076,7 +3078,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
 	 * started waiting. For timeouts, we always want to return to userspace,
 	 * regardless of event count.
 	 */
-	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+	return io_cqring_events(ctx) >= iowq->to_wait ||
 			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
 }
 
@@ -3111,7 +3113,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	struct io_rings *rings = ctx->rings;
 	int ret = 0;
 
-	if (io_cqring_events(rings) >= min_events)
+	if (io_cqring_events(ctx) >= min_events)
 		return 0;
 
 	if (sig) {

From 78e19bbef38362cebff38aa1ca12e2c82bb72eb8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 6 Nov 2019 15:21:34 -0700
Subject: [PATCH 38/64] io_uring: pass in io_kiocb to fill/add CQ handlers

This is in preparation for handling CQ ring overflow a bit smarter. We
should not have any functional changes in this patch. Most of the
changes are fairly straight forward, the only ones that stick out a bit
are the ones that change __io_free_req() to take the reference count
into account. If the request hasn't been submitted yet, we know it's
safe to simply ignore references and free it. But let's clean these up
too, as later patches will depend on the caller doing the right thing if
the completion logging grabs a reference to the request.

Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 96 +++++++++++++++++++++++++++------------------------
 1 file changed, 50 insertions(+), 46 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d8e15cce936e..91103fc9771d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -369,10 +369,10 @@ struct io_submit_state {
 };
 
 static void io_wq_submit_work(struct io_wq_work **workptr);
-static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
-				 long res);
+static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void __io_free_req(struct io_kiocb *req);
 static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr);
+static void io_double_put_req(struct io_kiocb *req);
 
 static struct kmem_cache *req_cachep;
 
@@ -535,8 +535,8 @@ static void io_kill_timeout(struct io_kiocb *req)
 	if (ret != -1) {
 		atomic_inc(&req->ctx->cq_timeouts);
 		list_del_init(&req->list);
-		io_cqring_fill_event(req->ctx, req->user_data, 0);
-		__io_free_req(req);
+		io_cqring_fill_event(req, 0);
+		io_put_req(req, NULL);
 	}
 }
 
@@ -588,12 +588,12 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 	return &rings->cqes[tail & ctx->cq_mask];
 }
 
-static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
-				 long res)
+static void io_cqring_fill_event(struct io_kiocb *req, long res)
 {
+	struct io_ring_ctx *ctx = req->ctx;
 	struct io_uring_cqe *cqe;
 
-	trace_io_uring_complete(ctx, ki_user_data, res);
+	trace_io_uring_complete(ctx, req->user_data, res);
 
 	/*
 	 * If we can't get a cq entry, userspace overflowed the
@@ -602,7 +602,7 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 	 */
 	cqe = io_get_cqring(ctx);
 	if (cqe) {
-		WRITE_ONCE(cqe->user_data, ki_user_data);
+		WRITE_ONCE(cqe->user_data, req->user_data);
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, 0);
 	} else {
@@ -621,13 +621,13 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 		eventfd_signal(ctx->cq_ev_fd, 1);
 }
 
-static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
-				long res)
+static void io_cqring_add_event(struct io_kiocb *req, long res)
 {
+	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
-	io_cqring_fill_event(ctx, user_data, res);
+	io_cqring_fill_event(req, res);
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
@@ -721,10 +721,10 @@ static bool io_link_cancel_timeout(struct io_ring_ctx *ctx,
 
 	ret = hrtimer_try_to_cancel(&req->timeout.timer);
 	if (ret != -1) {
-		io_cqring_fill_event(ctx, req->user_data, -ECANCELED);
+		io_cqring_fill_event(req, -ECANCELED);
 		io_commit_cqring(ctx);
 		req->flags &= ~REQ_F_LINK;
-		__io_free_req(req);
+		io_put_req(req, NULL);
 		return true;
 	}
 
@@ -795,8 +795,8 @@ static void io_fail_links(struct io_kiocb *req)
 		    link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) {
 			io_link_cancel_timeout(ctx, link);
 		} else {
-			io_cqring_fill_event(ctx, link->user_data, -ECANCELED);
-			__io_free_req(link);
+			io_cqring_fill_event(link, -ECANCELED);
+			io_double_put_req(link);
 		}
 	}
 
@@ -866,6 +866,13 @@ static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr)
 	}
 }
 
+static void io_double_put_req(struct io_kiocb *req)
+{
+	/* drop both submit and complete references */
+	if (refcount_sub_and_test(2, &req->refs))
+		__io_free_req(req);
+}
+
 static unsigned io_cqring_events(struct io_ring_ctx *ctx)
 {
 	struct io_rings *rings = ctx->rings;
@@ -898,7 +905,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		req = list_first_entry(done, struct io_kiocb, list);
 		list_del(&req->list);
 
-		io_cqring_fill_event(ctx, req->user_data, req->result);
+		io_cqring_fill_event(req, req->result);
 		(*nr_events)++;
 
 		if (refcount_dec_and_test(&req->refs)) {
@@ -1094,7 +1101,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
 
 	if ((req->flags & REQ_F_LINK) && res != req->result)
 		req->flags |= REQ_F_FAIL_LINK;
-	io_cqring_add_event(req->ctx, req->user_data, res);
+	io_cqring_add_event(req, res);
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
@@ -1595,15 +1602,14 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
 /*
  * IORING_OP_NOP just posts a completion event, nothing else.
  */
-static int io_nop(struct io_kiocb *req, u64 user_data)
+static int io_nop(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	long err = 0;
 
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 
-	io_cqring_add_event(ctx, user_data, err);
+	io_cqring_add_event(req, 0);
 	io_put_req(req, NULL);
 	return 0;
 }
@@ -1650,7 +1656,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
-	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_cqring_add_event(req, ret);
 	io_put_req(req, nxt);
 	return 0;
 }
@@ -1697,7 +1703,7 @@ static int io_sync_file_range(struct io_kiocb *req,
 
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
-	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_cqring_add_event(req, ret);
 	io_put_req(req, nxt);
 	return 0;
 }
@@ -1733,7 +1739,7 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			return ret;
 	}
 
-	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_cqring_add_event(req, ret);
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req(req, nxt);
@@ -1789,7 +1795,7 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	}
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
-	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_cqring_add_event(req, ret);
 	io_put_req(req, nxt);
 	return 0;
 #else
@@ -1850,7 +1856,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	}
 	spin_unlock_irq(&ctx->completion_lock);
 
-	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_cqring_add_event(req, ret);
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req(req, NULL);
@@ -1861,7 +1867,7 @@ static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			     __poll_t mask)
 {
 	req->poll.done = true;
-	io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask));
+	io_cqring_fill_event(req, mangle_poll(mask));
 	io_commit_cqring(ctx);
 }
 
@@ -2055,7 +2061,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 		list_del_init(&req->list);
 	}
 
-	io_cqring_fill_event(ctx, req->user_data, -ETIME);
+	io_cqring_fill_event(req, -ETIME);
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
@@ -2099,7 +2105,7 @@ static int io_timeout_remove(struct io_kiocb *req,
 	/* didn't find timeout */
 	if (ret) {
 fill_ev:
-		io_cqring_fill_event(ctx, req->user_data, ret);
+		io_cqring_fill_event(req, ret);
 		io_commit_cqring(ctx);
 		spin_unlock_irq(&ctx->completion_lock);
 		io_cqring_ev_posted(ctx);
@@ -2115,8 +2121,8 @@ static int io_timeout_remove(struct io_kiocb *req,
 		goto fill_ev;
 	}
 
-	io_cqring_fill_event(ctx, req->user_data, 0);
-	io_cqring_fill_event(ctx, treq->user_data, -ECANCELED);
+	io_cqring_fill_event(req, 0);
+	io_cqring_fill_event(treq, -ECANCELED);
 	io_commit_cqring(ctx);
 	spin_unlock_irq(&ctx->completion_lock);
 	io_cqring_ev_posted(ctx);
@@ -2256,7 +2262,7 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
-	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_cqring_add_event(req, ret);
 	io_put_req(req, nxt);
 	return 0;
 }
@@ -2295,12 +2301,10 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	int ret, opcode;
 	struct sqe_submit *s = &req->submit;
 
-	req->user_data = READ_ONCE(s->sqe->user_data);
-
 	opcode = READ_ONCE(s->sqe->opcode);
 	switch (opcode) {
 	case IORING_OP_NOP:
-		ret = io_nop(req, req->user_data);
+		ret = io_nop(req);
 		break;
 	case IORING_OP_READV:
 		if (unlikely(s->sqe->buf_index))
@@ -2409,7 +2413,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 	if (ret) {
 		if (req->flags & REQ_F_LINK)
 			req->flags |= REQ_F_FAIL_LINK;
-		io_cqring_add_event(ctx, sqe->user_data, ret);
+		io_cqring_add_event(req, ret);
 		io_put_req(req, NULL);
 	}
 
@@ -2539,7 +2543,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 		ret = io_async_cancel_one(ctx, user_data);
 	}
 
-	io_cqring_add_event(ctx, req->user_data, ret);
+	io_cqring_add_event(req, ret);
 	io_put_req(req, NULL);
 	return HRTIMER_NORESTART;
 }
@@ -2582,7 +2586,7 @@ static int io_queue_linked_timeout(struct io_kiocb *req, struct io_kiocb *nxt)
 		 * failed by the regular submission path.
 		 */
 		list_del(&nxt->list);
-		io_cqring_fill_event(ctx, nxt->user_data, ret);
+		io_cqring_fill_event(nxt, ret);
 		trace_io_uring_fail_link(req, nxt);
 		io_commit_cqring(ctx);
 		io_put_req(nxt, NULL);
@@ -2655,7 +2659,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req)
 
 	/* and drop final reference, if we failed */
 	if (ret) {
-		io_cqring_add_event(ctx, req->user_data, ret);
+		io_cqring_add_event(req, ret);
 		if (req->flags & REQ_F_LINK)
 			req->flags |= REQ_F_FAIL_LINK;
 		io_put_req(req, NULL);
@@ -2671,8 +2675,8 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req)
 	ret = io_req_defer(ctx, req);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
-			io_cqring_add_event(ctx, req->submit.sqe->user_data, ret);
-			io_free_req(req, NULL);
+			io_cqring_add_event(req, ret);
+			io_double_put_req(req);
 		}
 		return 0;
 	}
@@ -2698,8 +2702,8 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	ret = io_req_defer(ctx, req);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
-			io_cqring_add_event(ctx, req->submit.sqe->user_data, ret);
-			io_free_req(req, NULL);
+			io_cqring_add_event(req, ret);
+			io_double_put_req(req);
 			__io_free_req(shadow);
 			return 0;
 		}
@@ -2732,6 +2736,8 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	struct sqe_submit *s = &req->submit;
 	int ret;
 
+	req->user_data = s->sqe->user_data;
+
 	/* enforce forwards compatibility on users */
 	if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
 		ret = -EINVAL;
@@ -2741,13 +2747,11 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	ret = io_req_set_file(ctx, state, req);
 	if (unlikely(ret)) {
 err_req:
-		io_cqring_add_event(ctx, s->sqe->user_data, ret);
-		io_free_req(req, NULL);
+		io_cqring_add_event(req, ret);
+		io_double_put_req(req);
 		return;
 	}
 
-	req->user_data = s->sqe->user_data;
-
 	/*
 	 * If we already have a head request, queue this one for async
 	 * submittal once the head completes. If we don't have a head but

From 1d7bb1d50fb4dc141c7431cc21fdd24ffcc83c76 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 6 Nov 2019 11:31:17 -0700
Subject: [PATCH 39/64] io_uring: add support for backlogged CQ ring

Currently we drop completion events, if the CQ ring is full. That's fine
for requests with bounded completion times, but it may make it harder or
impossible to use io_uring with networked IO where request completion
times are generally unbounded. Or with POLL, for example, which is also
unbounded.

After this patch, we never overflow the ring, we simply store requests
in a backlog for later flushing. This flushing is done automatically by
the kernel. To prevent the backlog from growing indefinitely, if the
backlog is non-empty, we apply back pressure on IO submissions. Any
attempt to submit new IO with a non-empty backlog will get an -EBUSY
return from the kernel. This is a signal to the application that it has
backlogged CQ events, and that it must reap those before being allowed
to submit more IO.

Note that if we do return -EBUSY, we will have filled whatever
backlogged events into the CQ ring first, if there's room. This means
the application can safely reap events WITHOUT entering the kernel and
waiting for them, they are already available in the CQ ring.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 125 +++++++++++++++++++++++++++-------
 include/uapi/linux/io_uring.h |   1 +
 2 files changed, 103 insertions(+), 23 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 91103fc9771d..4d89a2f222bf 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -185,6 +185,7 @@ struct io_ring_ctx {
 		unsigned int		flags;
 		bool			compat;
 		bool			account_mem;
+		bool			cq_overflow_flushed;
 
 		/*
 		 * Ring buffer of indices into array of io_uring_sqe, which is
@@ -207,6 +208,7 @@ struct io_ring_ctx {
 
 		struct list_head	defer_list;
 		struct list_head	timeout_list;
+		struct list_head	cq_overflow_list;
 
 		wait_queue_head_t	inflight_wait;
 	} ____cacheline_aligned_in_smp;
@@ -414,6 +416,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 
 	ctx->flags = p->flags;
 	init_waitqueue_head(&ctx->cq_wait);
+	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	init_completion(&ctx->ctx_done);
 	init_completion(&ctx->sqo_thread_started);
 	mutex_init(&ctx->uring_lock);
@@ -588,6 +591,67 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 	return &rings->cqes[tail & ctx->cq_mask];
 }
 
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+	if (waitqueue_active(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
+	if (ctx->cq_ev_fd)
+		eventfd_signal(ctx->cq_ev_fd, 1);
+}
+
+static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
+{
+	struct io_rings *rings = ctx->rings;
+	struct io_uring_cqe *cqe;
+	struct io_kiocb *req;
+	unsigned long flags;
+	LIST_HEAD(list);
+
+	if (!force) {
+		if (list_empty_careful(&ctx->cq_overflow_list))
+			return;
+		if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
+		    rings->cq_ring_entries))
+			return;
+	}
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+
+	/* if force is set, the ring is going away. always drop after that */
+	if (force)
+		ctx->cq_overflow_flushed = true;
+
+	while (!list_empty(&ctx->cq_overflow_list)) {
+		cqe = io_get_cqring(ctx);
+		if (!cqe && !force)
+			break;
+
+		req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
+						list);
+		list_move(&req->list, &list);
+		if (cqe) {
+			WRITE_ONCE(cqe->user_data, req->user_data);
+			WRITE_ONCE(cqe->res, req->result);
+			WRITE_ONCE(cqe->flags, 0);
+		} else {
+			WRITE_ONCE(ctx->rings->cq_overflow,
+				atomic_inc_return(&ctx->cached_cq_overflow));
+		}
+	}
+
+	io_commit_cqring(ctx);
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	io_cqring_ev_posted(ctx);
+
+	while (!list_empty(&list)) {
+		req = list_first_entry(&list, struct io_kiocb, list);
+		list_del(&req->list);
+		io_put_req(req, NULL);
+	}
+}
+
 static void io_cqring_fill_event(struct io_kiocb *req, long res)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -601,26 +665,20 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
 	 * the ring.
 	 */
 	cqe = io_get_cqring(ctx);
-	if (cqe) {
+	if (likely(cqe)) {
 		WRITE_ONCE(cqe->user_data, req->user_data);
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, 0);
-	} else {
+	} else if (ctx->cq_overflow_flushed) {
 		WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
+	} else {
+		refcount_inc(&req->refs);
+		req->result = res;
+		list_add_tail(&req->list, &ctx->cq_overflow_list);
 	}
 }
 
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
-{
-	if (waitqueue_active(&ctx->wait))
-		wake_up(&ctx->wait);
-	if (waitqueue_active(&ctx->sqo_wait))
-		wake_up(&ctx->sqo_wait);
-	if (ctx->cq_ev_fd)
-		eventfd_signal(ctx->cq_ev_fd, 1);
-}
-
 static void io_cqring_add_event(struct io_kiocb *req, long res)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -873,10 +931,20 @@ static void io_double_put_req(struct io_kiocb *req)
 		__io_free_req(req);
 }
 
-static unsigned io_cqring_events(struct io_ring_ctx *ctx)
+static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
 {
 	struct io_rings *rings = ctx->rings;
 
+	/*
+	 * noflush == true is from the waitqueue handler, just ensure we wake
+	 * up the task, and the next invocation will flush the entries. We
+	 * cannot safely to it from here.
+	 */
+	if (noflush && !list_empty(&ctx->cq_overflow_list))
+		return -1U;
+
+	io_cqring_overflow_flush(ctx, false);
+
 	/* See comment at the top of this file */
 	smp_rmb();
 	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
@@ -1032,7 +1100,7 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 		 * If we do, we can potentially be spinning for commands that
 		 * already triggered a CQE (eg in error).
 		 */
-		if (io_cqring_events(ctx))
+		if (io_cqring_events(ctx, false))
 			break;
 
 		/*
@@ -2876,6 +2944,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	int i, submitted = 0;
 	bool mm_fault = false;
 
+	if (!list_empty(&ctx->cq_overflow_list)) {
+		io_cqring_overflow_flush(ctx, false);
+		return -EBUSY;
+	}
+
 	if (nr > IO_PLUG_THRESHOLD) {
 		io_submit_state_start(&state, ctx, nr);
 		statep = &state;
@@ -2967,6 +3040,7 @@ static int io_sq_thread(void *data)
 	timeout = inflight = 0;
 	while (!kthread_should_park()) {
 		unsigned int to_submit;
+		int ret;
 
 		if (inflight) {
 			unsigned nr_events = 0;
@@ -3051,8 +3125,9 @@ static int io_sq_thread(void *data)
 		}
 
 		to_submit = min(to_submit, ctx->sq_entries);
-		inflight += io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm,
-					   true);
+		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+		if (ret > 0)
+			inflight += ret;
 	}
 
 	set_fs(old_fs);
@@ -3073,7 +3148,7 @@ struct io_wait_queue {
 	unsigned nr_timeouts;
 };
 
-static inline bool io_should_wake(struct io_wait_queue *iowq)
+static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
 {
 	struct io_ring_ctx *ctx = iowq->ctx;
 
@@ -3082,7 +3157,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
 	 * started waiting. For timeouts, we always want to return to userspace,
 	 * regardless of event count.
 	 */
-	return io_cqring_events(ctx) >= iowq->to_wait ||
+	return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
 			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
 }
 
@@ -3092,7 +3167,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
 	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
 							wq);
 
-	if (!io_should_wake(iowq))
+	/* use noflush == true, as we can't safely rely on locking context */
+	if (!io_should_wake(iowq, true))
 		return -1;
 
 	return autoremove_wake_function(curr, mode, wake_flags, key);
@@ -3117,7 +3193,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	struct io_rings *rings = ctx->rings;
 	int ret = 0;
 
-	if (io_cqring_events(ctx) >= min_events)
+	if (io_cqring_events(ctx, false) >= min_events)
 		return 0;
 
 	if (sig) {
@@ -3138,7 +3214,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	do {
 		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
 						TASK_INTERRUPTIBLE);
-		if (io_should_wake(&iowq))
+		if (io_should_wake(&iowq, false))
 			break;
 		schedule();
 		if (signal_pending(current)) {
@@ -4061,6 +4137,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 		io_wq_cancel_all(ctx->io_wq);
 
 	io_iopoll_reap_events(ctx);
+	io_cqring_overflow_flush(ctx, true);
 	wait_for_completion(&ctx->ctx_done);
 	io_ring_ctx_free(ctx);
 }
@@ -4116,8 +4193,10 @@ static int io_uring_flush(struct file *file, void *data)
 	struct io_ring_ctx *ctx = file->private_data;
 
 	io_uring_cancel_files(ctx, data);
-	if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
+	if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
+		io_cqring_overflow_flush(ctx, true);
 		io_wq_cancel_all(ctx->io_wq);
+	}
 	return 0;
 }
 
@@ -4391,7 +4470,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 	if (ret < 0)
 		goto err;
 
-	p->features = IORING_FEAT_SINGLE_MMAP;
+	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP;
 	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
 	return ret;
 err:
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f1a118b01d18..2a1569211d87 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -155,6 +155,7 @@ struct io_uring_params {
  * io_uring_params->features flags
  */
 #define IORING_FEAT_SINGLE_MMAP		(1U << 0)
+#define IORING_FEAT_NODROP		(1U << 1)
 
 /*
  * io_uring_register(2) opcodes and arguments

From 91d666ea43adef57a6cd50c81b9603c545654981 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 7 Nov 2019 09:17:36 -0700
Subject: [PATCH 40/64] io-wq: io_wqe_run_queue() doesn't need to use
 list_empty_careful()

We hold the wqe lock at this point (which is also annotated), so there's
no need to use the careful variant of list_empty().

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index ba40a7ee31c3..9b375009a553 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -338,8 +338,7 @@ static void io_worker_handle_work(struct io_worker *worker)
 static inline bool io_wqe_run_queue(struct io_wqe *wqe)
 	__must_hold(wqe->lock)
 {
-	if (!list_empty_careful(&wqe->work_list) &&
-	    !(wqe->flags & IO_WQE_FLAG_STALLED))
+	if (!list_empty(&wqe->work_list) && !(wqe->flags & IO_WQE_FLAG_STALLED))
 		return true;
 	return false;
 }

From c5def4ab849494d3c97f6c9fc84b2ddb868fe78c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 7 Nov 2019 11:41:16 -0700
Subject: [PATCH 41/64] io-wq: add support for bounded vs unbunded work

io_uring supports request types that basically have two different
lifetimes:

1) Bounded completion time. These are requests like disk reads or writes,
   which we know will finish in a finite amount of time.
2) Unbounded completion time. These are generally networked IO, where we
   have no idea how long they will take to complete. Another example is
   POLL commands.

This patch provides support for io-wq to handle these differently, so we
don't starve bounded requests by tying up workers for too long. By default
all work is bounded, unless otherwise specified in the work item.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    | 296 ++++++++++++++++++++++++++++++++++++--------------
 fs/io-wq.h    |   4 +-
 fs/io_uring.c |   2 +-
 3 files changed, 220 insertions(+), 82 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 9b375009a553..33b14b85752b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -27,6 +27,7 @@ enum {
 	IO_WORKER_F_FREE	= 4,	/* worker on free list */
 	IO_WORKER_F_EXITING	= 8,	/* worker exiting */
 	IO_WORKER_F_FIXED	= 16,	/* static idle worker */
+	IO_WORKER_F_BOUND	= 32,	/* is doing bounded work */
 };
 
 enum {
@@ -66,6 +67,17 @@ struct io_wq_nulls_list {
 #define IO_WQ_HASH_ORDER	5
 #endif
 
+struct io_wqe_acct {
+	unsigned nr_workers;
+	unsigned max_workers;
+	atomic_t nr_running;
+};
+
+enum {
+	IO_WQ_ACCT_BOUND,
+	IO_WQ_ACCT_UNBOUND,
+};
+
 /*
  * Per-node worker thread pool
  */
@@ -78,9 +90,7 @@ struct io_wqe {
 	} ____cacheline_aligned_in_smp;
 
 	int node;
-	unsigned nr_workers;
-	unsigned max_workers;
-	atomic_t nr_running;
+	struct io_wqe_acct acct[2];
 
 	struct io_wq_nulls_list free_list;
 	struct io_wq_nulls_list busy_list;
@@ -97,6 +107,7 @@ struct io_wq {
 	unsigned nr_wqes;
 
 	struct task_struct *manager;
+	struct user_struct *user;
 	struct mm_struct *mm;
 	refcount_t refs;
 	struct completion done;
@@ -152,10 +163,29 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
 	return dropped_lock;
 }
 
+static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
+						   struct io_wq_work *work)
+{
+	if (work->flags & IO_WQ_WORK_UNBOUND)
+		return &wqe->acct[IO_WQ_ACCT_UNBOUND];
+
+	return &wqe->acct[IO_WQ_ACCT_BOUND];
+}
+
+static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe,
+						  struct io_worker *worker)
+{
+	if (worker->flags & IO_WORKER_F_BOUND)
+		return &wqe->acct[IO_WQ_ACCT_BOUND];
+
+	return &wqe->acct[IO_WQ_ACCT_UNBOUND];
+}
+
 static void io_worker_exit(struct io_worker *worker)
 {
 	struct io_wqe *wqe = worker->wqe;
-	bool all_done = false;
+	struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+	unsigned nr_workers;
 
 	/*
 	 * If we're not at zero, someone else is holding a brief reference
@@ -169,7 +199,9 @@ static void io_worker_exit(struct io_worker *worker)
 	preempt_disable();
 	current->flags &= ~PF_IO_WORKER;
 	if (worker->flags & IO_WORKER_F_RUNNING)
-		atomic_dec(&wqe->nr_running);
+		atomic_dec(&acct->nr_running);
+	if (!(worker->flags & IO_WORKER_F_BOUND))
+		atomic_dec(&wqe->wq->user->processes);
 	worker->flags = 0;
 	preempt_enable();
 
@@ -179,17 +211,88 @@ static void io_worker_exit(struct io_worker *worker)
 		__release(&wqe->lock);
 		spin_lock_irq(&wqe->lock);
 	}
-	wqe->nr_workers--;
-	all_done = !wqe->nr_workers;
+	acct->nr_workers--;
+	nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers +
+			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers;
 	spin_unlock_irq(&wqe->lock);
 
 	/* all workers gone, wq exit can proceed */
-	if (all_done && refcount_dec_and_test(&wqe->wq->refs))
+	if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs))
 		complete(&wqe->wq->done);
 
 	kfree_rcu(worker, rcu);
 }
 
+static inline bool io_wqe_run_queue(struct io_wqe *wqe)
+	__must_hold(wqe->lock)
+{
+	if (!list_empty(&wqe->work_list) && !(wqe->flags & IO_WQE_FLAG_STALLED))
+		return true;
+	return false;
+}
+
+/*
+ * Check head of free list for an available worker. If one isn't available,
+ * caller must wake up the wq manager to create one.
+ */
+static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
+	__must_hold(RCU)
+{
+	struct hlist_nulls_node *n;
+	struct io_worker *worker;
+
+	n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list.head));
+	if (is_a_nulls(n))
+		return false;
+
+	worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
+	if (io_worker_get(worker)) {
+		wake_up(&worker->wait);
+		io_worker_release(worker);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * We need a worker. If we find a free one, we're good. If not, and we're
+ * below the max number of workers, wake up the manager to create one.
+ */
+static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
+{
+	bool ret;
+
+	/*
+	 * Most likely an attempt to queue unbounded work on an io_wq that
+	 * wasn't setup with any unbounded workers.
+	 */
+	WARN_ON_ONCE(!acct->max_workers);
+
+	rcu_read_lock();
+	ret = io_wqe_activate_free_worker(wqe);
+	rcu_read_unlock();
+
+	if (!ret && acct->nr_workers < acct->max_workers)
+		wake_up_process(wqe->wq->manager);
+}
+
+static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker)
+{
+	struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+
+	atomic_inc(&acct->nr_running);
+}
+
+static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker)
+	__must_hold(wqe->lock)
+{
+	struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+
+	if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe))
+		io_wqe_wake_worker(wqe, acct);
+}
+
 static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
 {
 	allow_kernel_signal(SIGINT);
@@ -198,7 +301,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
 
 	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
 	worker->restore_files = current->files;
-	atomic_inc(&wqe->nr_running);
+	io_wqe_inc_running(wqe, worker);
 }
 
 /*
@@ -209,6 +312,8 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
 			     struct io_wq_work *work)
 	__must_hold(wqe->lock)
 {
+	bool worker_bound, work_bound;
+
 	if (worker->flags & IO_WORKER_F_FREE) {
 		worker->flags &= ~IO_WORKER_F_FREE;
 		hlist_nulls_del_init_rcu(&worker->nulls_node);
@@ -216,6 +321,28 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
 						&wqe->busy_list.head);
 	}
 	worker->cur_work = work;
+
+	/*
+	 * If worker is moving from bound to unbound (or vice versa), then
+	 * ensure we update the running accounting.
+	 */
+	 worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
+	 work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
+	 if (worker_bound != work_bound) {
+		io_wqe_dec_running(wqe, worker);
+		if (work_bound) {
+			worker->flags |= IO_WORKER_F_BOUND;
+			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
+			wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
+			atomic_dec(&wqe->wq->user->processes);
+		} else {
+			worker->flags &= ~IO_WORKER_F_BOUND;
+			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
+			wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
+			atomic_inc(&wqe->wq->user->processes);
+		}
+		io_wqe_inc_running(wqe, worker);
+	 }
 }
 
 /*
@@ -335,14 +462,6 @@ static void io_worker_handle_work(struct io_worker *worker)
 	} while (1);
 }
 
-static inline bool io_wqe_run_queue(struct io_wqe *wqe)
-	__must_hold(wqe->lock)
-{
-	if (!list_empty(&wqe->work_list) && !(wqe->flags & IO_WQE_FLAG_STALLED))
-		return true;
-	return false;
-}
-
 static int io_wqe_worker(void *data)
 {
 	struct io_worker *worker = data;
@@ -391,46 +510,6 @@ static int io_wqe_worker(void *data)
 	return 0;
 }
 
-/*
- * Check head of free list for an available worker. If one isn't available,
- * caller must wake up the wq manager to create one.
- */
-static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
-	__must_hold(RCU)
-{
-	struct hlist_nulls_node *n;
-	struct io_worker *worker;
-
-	n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list.head));
-	if (is_a_nulls(n))
-		return false;
-
-	worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
-	if (io_worker_get(worker)) {
-		wake_up(&worker->wait);
-		io_worker_release(worker);
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * We need a worker. If we find a free one, we're good. If not, and we're
- * below the max number of workers, wake up the manager to create one.
- */
-static void io_wqe_wake_worker(struct io_wqe *wqe)
-{
-	bool ret;
-
-	rcu_read_lock();
-	ret = io_wqe_activate_free_worker(wqe);
-	rcu_read_unlock();
-
-	if (!ret && wqe->nr_workers < wqe->max_workers)
-		wake_up_process(wqe->wq->manager);
-}
-
 /*
  * Called when a worker is scheduled in. Mark us as currently running.
  */
@@ -444,7 +523,7 @@ void io_wq_worker_running(struct task_struct *tsk)
 	if (worker->flags & IO_WORKER_F_RUNNING)
 		return;
 	worker->flags |= IO_WORKER_F_RUNNING;
-	atomic_inc(&wqe->nr_running);
+	io_wqe_inc_running(wqe, worker);
 }
 
 /*
@@ -465,13 +544,13 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
 	worker->flags &= ~IO_WORKER_F_RUNNING;
 
 	spin_lock_irq(&wqe->lock);
-	if (atomic_dec_and_test(&wqe->nr_running) && io_wqe_run_queue(wqe))
-		io_wqe_wake_worker(wqe);
+	io_wqe_dec_running(wqe, worker);
 	spin_unlock_irq(&wqe->lock);
 }
 
-static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe)
+static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 {
+	struct io_wqe_acct *acct =&wqe->acct[index];
 	struct io_worker *worker;
 
 	worker = kcalloc_node(1, sizeof(*worker), GFP_KERNEL, wqe->node);
@@ -484,7 +563,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe)
 	worker->wqe = wqe;
 
 	worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
-						"io_wqe_worker-%d", wqe->node);
+				"io_wqe_worker-%d/%d", index, wqe->node);
 	if (IS_ERR(worker->task)) {
 		kfree(worker);
 		return;
@@ -493,24 +572,31 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe)
 	spin_lock_irq(&wqe->lock);
 	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list.head);
 	worker->flags |= IO_WORKER_F_FREE;
-	if (!wqe->nr_workers)
+	if (index == IO_WQ_ACCT_BOUND)
+		worker->flags |= IO_WORKER_F_BOUND;
+	if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
 		worker->flags |= IO_WORKER_F_FIXED;
-	wqe->nr_workers++;
+	acct->nr_workers++;
 	spin_unlock_irq(&wqe->lock);
 
+	if (index == IO_WQ_ACCT_UNBOUND)
+		atomic_inc(&wq->user->processes);
+
 	wake_up_process(worker->task);
 }
 
-static inline bool io_wqe_need_new_worker(struct io_wqe *wqe)
+static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
 	__must_hold(wqe->lock)
 {
-	if (!wqe->nr_workers)
-		return true;
-	if (hlist_nulls_empty(&wqe->free_list.head) &&
-	    wqe->nr_workers < wqe->max_workers && io_wqe_run_queue(wqe))
-		return true;
+	struct io_wqe_acct *acct = &wqe->acct[index];
 
-	return false;
+	/* always ensure we have one bounded worker */
+	if (index == IO_WQ_ACCT_BOUND && !acct->nr_workers)
+		return true;
+	/* if we have available workers or no work, no need */
+	if (!hlist_nulls_empty(&wqe->free_list.head) || !io_wqe_run_queue(wqe))
+		return false;
+	return acct->nr_workers < acct->max_workers;
 }
 
 /*
@@ -525,13 +611,18 @@ static int io_wq_manager(void *data)
 
 		for (i = 0; i < wq->nr_wqes; i++) {
 			struct io_wqe *wqe = wq->wqes[i];
-			bool fork_worker = false;
+			bool fork_worker[2] = { false, false };
 
 			spin_lock_irq(&wqe->lock);
-			fork_worker = io_wqe_need_new_worker(wqe);
+			if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
+				fork_worker[IO_WQ_ACCT_BOUND] = true;
+			if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
+				fork_worker[IO_WQ_ACCT_UNBOUND] = true;
 			spin_unlock_irq(&wqe->lock);
-			if (fork_worker)
-				create_io_worker(wq, wqe);
+			if (fork_worker[IO_WQ_ACCT_BOUND])
+				create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
+			if (fork_worker[IO_WQ_ACCT_UNBOUND])
+				create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
 		}
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(HZ);
@@ -540,17 +631,53 @@ static int io_wq_manager(void *data)
 	return 0;
 }
 
+static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
+			    struct io_wq_work *work)
+{
+	bool free_worker;
+
+	if (!(work->flags & IO_WQ_WORK_UNBOUND))
+		return true;
+	if (atomic_read(&acct->nr_running))
+		return true;
+
+	rcu_read_lock();
+	free_worker = !hlist_nulls_empty(&wqe->free_list.head);
+	rcu_read_unlock();
+	if (free_worker)
+		return true;
+
+	if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers &&
+	    !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
+		return false;
+
+	return true;
+}
+
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 {
+	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
 	unsigned long flags;
 
+	/*
+	 * Do early check to see if we need a new unbound worker, and if we do,
+	 * if we're allowed to do so. This isn't 100% accurate as there's a
+	 * gap between this check and incrementing the value, but that's OK.
+	 * It's close enough to not be an issue, fork() has the same delay.
+	 */
+	if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
+		work->flags |= IO_WQ_WORK_CANCEL;
+		work->func(&work);
+		return;
+	}
+
 	spin_lock_irqsave(&wqe->lock, flags);
 	list_add_tail(&work->list, &wqe->work_list);
 	wqe->flags &= ~IO_WQE_FLAG_STALLED;
 	spin_unlock_irqrestore(&wqe->lock, flags);
 
-	if (!atomic_read(&wqe->nr_running))
-		io_wqe_wake_worker(wqe);
+	if (!atomic_read(&acct->nr_running))
+		io_wqe_wake_worker(wqe, acct);
 }
 
 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
@@ -828,7 +955,8 @@ void io_wq_flush(struct io_wq *wq)
 	}
 }
 
-struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm)
+struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
+			   struct user_struct *user)
 {
 	int ret = -ENOMEM, i, node;
 	struct io_wq *wq;
@@ -844,6 +972,9 @@ struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm)
 		return ERR_PTR(-ENOMEM);
 	}
 
+	/* caller must already hold a reference to this */
+	wq->user = user;
+
 	i = 0;
 	refcount_set(&wq->refs, wq->nr_wqes);
 	for_each_online_node(node) {
@@ -854,7 +985,13 @@ struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm)
 			break;
 		wq->wqes[i] = wqe;
 		wqe->node = node;
-		wqe->max_workers = concurrency;
+		wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
+		atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
+		if (user) {
+			wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
+					task_rlimit(current, RLIMIT_NPROC);
+		}
+		atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
 		wqe->node = node;
 		wqe->wq = wq;
 		spin_lock_init(&wqe->lock);
@@ -863,7 +1000,6 @@ struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm)
 		wqe->free_list.nulls = 0;
 		INIT_HLIST_NULLS_HEAD(&wqe->busy_list.head, 1);
 		wqe->busy_list.nulls = 1;
-		atomic_set(&wqe->nr_running, 0);
 
 		i++;
 	}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 3de192dc73fc..8cb345256f35 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -9,6 +9,7 @@ enum {
 	IO_WQ_WORK_HASHED	= 4,
 	IO_WQ_WORK_NEEDS_USER	= 8,
 	IO_WQ_WORK_NEEDS_FILES	= 16,
+	IO_WQ_WORK_UNBOUND	= 32,
 
 	IO_WQ_HASH_SHIFT	= 24,	/* upper 8 bits are used for hash key */
 };
@@ -33,7 +34,8 @@ struct io_wq_work {
 		(work)->files = NULL;			\
 	} while (0)					\
 
-struct io_wq *io_wq_create(unsigned concurrency, struct mm_struct *mm);
+struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
+				struct user_struct *user);
 void io_wq_destroy(struct io_wq *wq);
 
 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4d89a2f222bf..831bea0fbc75 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3745,7 +3745,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 
 	/* Do QD, or 4 * CPUS, whatever is smallest */
 	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
-	ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm);
+	ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, NULL);
 	if (IS_ERR(ctx->io_wq)) {
 		ret = PTR_ERR(ctx->io_wq);
 		ctx->io_wq = NULL;

From 5f8fd2d3e0a7aa7fc9d97226be24286edd289835 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 7 Nov 2019 10:57:36 -0700
Subject: [PATCH 42/64] io_uring: properly mark async work as bounded vs
 unbounded

Now that io-wq supports separating the two request lifetime types, mark
the following IO as having unbounded runtimes:

- Any read/write to a non-regular file
- Any specific networked IO
- Any poll command

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 831bea0fbc75..02a4f5e8a6e4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -506,6 +506,20 @@ static inline bool io_prep_async_work(struct io_kiocb *req)
 		case IORING_OP_WRITEV:
 		case IORING_OP_WRITE_FIXED:
 			do_hashed = true;
+			/* fall-through */
+		case IORING_OP_READV:
+		case IORING_OP_READ_FIXED:
+		case IORING_OP_SENDMSG:
+		case IORING_OP_RECVMSG:
+		case IORING_OP_ACCEPT:
+		case IORING_OP_POLL_ADD:
+			/*
+			 * We know REQ_F_ISREG is not set on some of these
+			 * opcodes, but this enables us to keep the check in
+			 * just one place.
+			 */
+			if (!(req->flags & REQ_F_ISREG))
+				req->work.flags |= IO_WQ_WORK_UNBOUND;
 			break;
 		}
 		if (io_sqe_needs_user(req->submit.sqe))
@@ -3745,7 +3759,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 
 	/* Do QD, or 4 * CPUS, whatever is smallest */
 	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
-	ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, NULL);
+	ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, ctx->user);
 	if (IS_ERR(ctx->io_wq)) {
 		ret = PTR_ERR(ctx->io_wq);
 		ctx->io_wq = NULL;

From 206aefde4f886fdeb3b6339aacab3a85fb74cb7e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 7 Nov 2019 18:27:42 -0700
Subject: [PATCH 43/64] io_uring: reduce/pack size of io_ring_ctx

With the recent flurry of additions and changes to io_uring, the
layout of io_ring_ctx has become a bit stale. We're right now at
704 bytes in size on my x86-64 build, or 11 cachelines. This
patch does two things:

- We have to completion structs embedded, that we only use for
  quiesce of the ctx (or shutdown) and for sqthread init cases.
  That 2x32 bytes right there, let's dynamically allocate them.

- Reorder the struct a bit with an eye on cachelines, use cases,
  and holes.

With this patch, we're down to 512 bytes, or 8 cachelines.

Reviewed-by: Jackie Liu <liuyun01@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 69 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 02a4f5e8a6e4..710eb27bf379 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -204,6 +204,7 @@ struct io_ring_ctx {
 		unsigned		sq_mask;
 		unsigned		sq_thread_idle;
 		unsigned		cached_sq_dropped;
+		atomic_t		cached_cq_overflow;
 		struct io_uring_sqe	*sq_sqes;
 
 		struct list_head	defer_list;
@@ -213,25 +214,13 @@ struct io_ring_ctx {
 		wait_queue_head_t	inflight_wait;
 	} ____cacheline_aligned_in_smp;
 
+	struct io_rings	*rings;
+
 	/* IO offload */
 	struct io_wq		*io_wq;
 	struct task_struct	*sqo_thread;	/* if using sq thread polling */
 	struct mm_struct	*sqo_mm;
 	wait_queue_head_t	sqo_wait;
-	struct completion	sqo_thread_started;
-
-	struct {
-		unsigned		cached_cq_tail;
-		atomic_t		cached_cq_overflow;
-		unsigned		cq_entries;
-		unsigned		cq_mask;
-		struct wait_queue_head	cq_wait;
-		struct fasync_struct	*cq_fasync;
-		struct eventfd_ctx	*cq_ev_fd;
-		atomic_t		cq_timeouts;
-	} ____cacheline_aligned_in_smp;
-
-	struct io_rings	*rings;
 
 	/*
 	 * If used, fixed file set. Writers must ensure that ->refs is dead,
@@ -247,7 +236,22 @@ struct io_ring_ctx {
 
 	struct user_struct	*user;
 
-	struct completion	ctx_done;
+	/* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
+	struct completion	*completions;
+
+#if defined(CONFIG_UNIX)
+	struct socket		*ring_sock;
+#endif
+
+	struct {
+		unsigned		cached_cq_tail;
+		unsigned		cq_entries;
+		unsigned		cq_mask;
+		atomic_t		cq_timeouts;
+		struct wait_queue_head	cq_wait;
+		struct fasync_struct	*cq_fasync;
+		struct eventfd_ctx	*cq_ev_fd;
+	} ____cacheline_aligned_in_smp;
 
 	struct {
 		struct mutex		uring_lock;
@@ -269,10 +273,6 @@ struct io_ring_ctx {
 		spinlock_t		inflight_lock;
 		struct list_head	inflight_list;
 	} ____cacheline_aligned_in_smp;
-
-#if defined(CONFIG_UNIX)
-	struct socket		*ring_sock;
-#endif
 };
 
 struct sqe_submit {
@@ -397,7 +397,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 {
 	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 
-	complete(&ctx->ctx_done);
+	complete(&ctx->completions[0]);
 }
 
 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
@@ -408,17 +408,19 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	if (!ctx)
 		return NULL;
 
+	ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
+	if (!ctx->completions)
+		goto err;
+
 	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
-			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
-		kfree(ctx);
-		return NULL;
-	}
+			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+		goto err;
 
 	ctx->flags = p->flags;
 	init_waitqueue_head(&ctx->cq_wait);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
-	init_completion(&ctx->ctx_done);
-	init_completion(&ctx->sqo_thread_started);
+	init_completion(&ctx->completions[0]);
+	init_completion(&ctx->completions[1]);
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
@@ -430,6 +432,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	spin_lock_init(&ctx->inflight_lock);
 	INIT_LIST_HEAD(&ctx->inflight_list);
 	return ctx;
+err:
+	kfree(ctx->completions);
+	kfree(ctx);
+	return NULL;
 }
 
 static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
@@ -3046,7 +3052,7 @@ static int io_sq_thread(void *data)
 	unsigned inflight;
 	unsigned long timeout;
 
-	complete(&ctx->sqo_thread_started);
+	complete(&ctx->completions[1]);
 
 	old_fs = get_fs();
 	set_fs(USER_DS);
@@ -3286,7 +3292,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
 {
 	if (ctx->sqo_thread) {
-		wait_for_completion(&ctx->sqo_thread_started);
+		wait_for_completion(&ctx->completions[1]);
 		/*
 		 * The park is a bit of a work-around, without it we get
 		 * warning spews on shutdown with SQPOLL set and affinity
@@ -4108,6 +4114,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		io_unaccount_mem(ctx->user,
 				ring_pages(ctx->sq_entries, ctx->cq_entries));
 	free_uid(ctx->user);
+	kfree(ctx->completions);
 	kfree(ctx);
 }
 
@@ -4152,7 +4159,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 
 	io_iopoll_reap_events(ctx);
 	io_cqring_overflow_flush(ctx, true);
-	wait_for_completion(&ctx->ctx_done);
+	wait_for_completion(&ctx->completions[0]);
 	io_ring_ctx_free(ctx);
 }
 
@@ -4555,7 +4562,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	 * no new references will come in after we've killed the percpu ref.
 	 */
 	mutex_unlock(&ctx->uring_lock);
-	wait_for_completion(&ctx->ctx_done);
+	wait_for_completion(&ctx->completions[0]);
 	mutex_lock(&ctx->uring_lock);
 
 	switch (opcode) {
@@ -4598,7 +4605,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	}
 
 	/* bring the ctx back to life */
-	reinit_completion(&ctx->ctx_done);
+	reinit_completion(&ctx->completions[0]);
 	percpu_ref_reinit(&ctx->refs);
 	return ret;
 }

From a197f664a0db8a6219d9ce949f5f29b89f60fb2b Mon Sep 17 00:00:00 2001
From: Jackie Liu <liuyun01@kylinos.cn>
Date: Fri, 8 Nov 2019 08:09:12 -0700
Subject: [PATCH 44/64] io_uring: remove passed in 'ctx' function parameter ctx
 if possible

Many times, the core of the function is req, and req has already set
req->ctx at initialization time, so there is no need to pass in the
ctx from the caller.

Cleanup, no functional change.

Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 108 ++++++++++++++++++++++++++------------------------
 1 file changed, 56 insertions(+), 52 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 710eb27bf379..147d1f0e13cc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -438,20 +438,20 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	return NULL;
 }
 
-static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
-				       struct io_kiocb *req)
+static inline bool __io_sequence_defer(struct io_kiocb *req)
 {
+	struct io_ring_ctx *ctx = req->ctx;
+
 	return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
 					+ atomic_read(&ctx->cached_cq_overflow);
 }
 
-static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
-				     struct io_kiocb *req)
+static inline bool io_sequence_defer(struct io_kiocb *req)
 {
 	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
 		return false;
 
-	return __io_sequence_defer(ctx, req);
+	return __io_sequence_defer(req);
 }
 
 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
@@ -459,7 +459,7 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 	struct io_kiocb *req;
 
 	req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
-	if (req && !io_sequence_defer(ctx, req)) {
+	if (req && !io_sequence_defer(req)) {
 		list_del_init(&req->list);
 		return req;
 	}
@@ -472,7 +472,7 @@ static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
 	struct io_kiocb *req;
 
 	req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
-	if (req && !__io_sequence_defer(ctx, req)) {
+	if (req && !__io_sequence_defer(req)) {
 		list_del_init(&req->list);
 		return req;
 	}
@@ -535,10 +535,10 @@ static inline bool io_prep_async_work(struct io_kiocb *req)
 	return do_hashed;
 }
 
-static inline void io_queue_async_work(struct io_ring_ctx *ctx,
-				       struct io_kiocb *req)
+static inline void io_queue_async_work(struct io_kiocb *req)
 {
 	bool do_hashed = io_prep_async_work(req);
+	struct io_ring_ctx *ctx = req->ctx;
 
 	trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
 					req->flags);
@@ -589,7 +589,7 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
 			continue;
 		}
 		req->flags |= REQ_F_IO_DRAINED;
-		io_queue_async_work(ctx, req);
+		io_queue_async_work(req);
 	}
 }
 
@@ -792,9 +792,9 @@ static void __io_free_req(struct io_kiocb *req)
 	kmem_cache_free(req_cachep, req);
 }
 
-static bool io_link_cancel_timeout(struct io_ring_ctx *ctx,
-				   struct io_kiocb *req)
+static bool io_link_cancel_timeout(struct io_kiocb *req)
 {
+	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
 
 	ret = hrtimer_try_to_cancel(&req->timeout.timer);
@@ -834,7 +834,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 		 * in this context instead of having to queue up new async work.
 		 */
 		if (req->flags & REQ_F_LINK_TIMEOUT) {
-			wake_ev = io_link_cancel_timeout(ctx, nxt);
+			wake_ev = io_link_cancel_timeout(nxt);
 
 			/* we dropped this link, get next */
 			nxt = list_first_entry_or_null(&req->link_list,
@@ -843,7 +843,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 			*nxtptr = nxt;
 			break;
 		} else {
-			io_queue_async_work(req->ctx, nxt);
+			io_queue_async_work(nxt);
 			break;
 		}
 	}
@@ -871,7 +871,7 @@ static void io_fail_links(struct io_kiocb *req)
 
 		if ((req->flags & REQ_F_LINK_TIMEOUT) &&
 		    link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) {
-			io_link_cancel_timeout(ctx, link);
+			io_link_cancel_timeout(link);
 		} else {
 			io_cqring_fill_event(link, -ECANCELED);
 			io_double_put_req(link);
@@ -940,7 +940,7 @@ static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr)
 		if (nxtptr)
 			*nxtptr = nxt;
 		else
-			io_queue_async_work(nxt->ctx, nxt);
+			io_queue_async_work(nxt);
 	}
 }
 
@@ -1899,7 +1899,7 @@ static void io_poll_remove_one(struct io_kiocb *req)
 	WRITE_ONCE(poll->canceled, true);
 	if (!list_empty(&poll->wait.entry)) {
 		list_del_init(&poll->wait.entry);
-		io_queue_async_work(req->ctx, req);
+		io_queue_async_work(req);
 	}
 	spin_unlock(&poll->head->lock);
 
@@ -1951,9 +1951,10 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			     __poll_t mask)
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
 {
+	struct io_ring_ctx *ctx = req->ctx;
+
 	req->poll.done = true;
 	io_cqring_fill_event(req, mangle_poll(mask));
 	io_commit_cqring(ctx);
@@ -1989,7 +1990,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 		return;
 	}
 	list_del_init(&req->list);
-	io_poll_complete(ctx, req, mask);
+	io_poll_complete(req, mask);
 	spin_unlock_irq(&ctx->completion_lock);
 
 	io_cqring_ev_posted(ctx);
@@ -2017,13 +2018,13 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 
 	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
 		list_del(&req->list);
-		io_poll_complete(ctx, req, mask);
+		io_poll_complete(req, mask);
 		spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 		io_cqring_ev_posted(ctx);
 		io_put_req(req, NULL);
 	} else {
-		io_queue_async_work(ctx, req);
+		io_queue_async_work(req);
 	}
 
 	return 1;
@@ -2108,7 +2109,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	}
 	if (mask) { /* no async, we'd stolen it */
 		ipt.error = 0;
-		io_poll_complete(ctx, req, mask);
+		io_poll_complete(req, mask);
 	}
 	spin_unlock_irq(&ctx->completion_lock);
 
@@ -2355,12 +2356,13 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
-static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req)
+static int io_req_defer(struct io_kiocb *req)
 {
 	const struct io_uring_sqe *sqe = req->submit.sqe;
 	struct io_uring_sqe *sqe_copy;
+	struct io_ring_ctx *ctx = req->ctx;
 
-	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
+	if (!io_sequence_defer(req) && list_empty(&ctx->defer_list))
 		return 0;
 
 	sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
@@ -2368,7 +2370,7 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req)
 		return -EAGAIN;
 
 	spin_lock_irq(&ctx->completion_lock);
-	if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
+	if (!io_sequence_defer(req) && list_empty(&ctx->defer_list)) {
 		spin_unlock_irq(&ctx->completion_lock);
 		kfree(sqe_copy);
 		return 0;
@@ -2383,11 +2385,12 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req)
 	return -EIOCBQUEUED;
 }
 
-static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			   struct io_kiocb **nxt, bool force_nonblock)
+static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
+			   bool force_nonblock)
 {
 	int ret, opcode;
 	struct sqe_submit *s = &req->submit;
+	struct io_ring_ctx *ctx = req->ctx;
 
 	opcode = READ_ONCE(s->sqe->opcode);
 	switch (opcode) {
@@ -2467,7 +2470,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 {
 	struct io_wq_work *work = *workptr;
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-	struct io_ring_ctx *ctx = req->ctx;
 	struct sqe_submit *s = &req->submit;
 	const struct io_uring_sqe *sqe = s->sqe;
 	struct io_kiocb *nxt = NULL;
@@ -2483,7 +2485,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 		s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
 		s->in_async = true;
 		do {
-			ret = __io_submit_sqe(ctx, req, &nxt, false);
+			ret = __io_submit_sqe(req, &nxt, false);
 			/*
 			 * We can get EAGAIN for polled IO even though we're
 			 * forcing a sync submission from here, since we can't
@@ -2537,10 +2539,10 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
 	return table->files[index & IORING_FILE_TABLE_MASK];
 }
 
-static int io_req_set_file(struct io_ring_ctx *ctx,
-			   struct io_submit_state *state, struct io_kiocb *req)
+static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
 {
 	struct sqe_submit *s = &req->submit;
+	struct io_ring_ctx *ctx = req->ctx;
 	unsigned flags;
 	int fd;
 
@@ -2580,9 +2582,10 @@ static int io_req_set_file(struct io_ring_ctx *ctx,
 	return 0;
 }
 
-static int io_grab_files(struct io_ring_ctx *ctx, struct io_kiocb *req)
+static int io_grab_files(struct io_kiocb *req)
 {
 	int ret = -EBADF;
+	struct io_ring_ctx *ctx = req->ctx;
 
 	rcu_read_lock();
 	spin_lock_irq(&ctx->inflight_lock);
@@ -2698,7 +2701,7 @@ static inline struct io_kiocb *io_get_linked_timeout(struct io_kiocb *req)
 	return NULL;
 }
 
-static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req)
+static int __io_queue_sqe(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt;
 	int ret;
@@ -2710,7 +2713,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req)
 			goto err;
 	}
 
-	ret = __io_submit_sqe(ctx, req, NULL, true);
+	ret = __io_submit_sqe(req, NULL, true);
 
 	/*
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -2725,7 +2728,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req)
 		if (sqe_copy) {
 			s->sqe = sqe_copy;
 			if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
-				ret = io_grab_files(ctx, req);
+				ret = io_grab_files(req);
 				if (ret) {
 					kfree(sqe_copy);
 					goto err;
@@ -2736,7 +2739,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req)
 			 * Queued up for async execution, worker will release
 			 * submit reference when the iocb is actually submitted.
 			 */
-			io_queue_async_work(ctx, req);
+			io_queue_async_work(req);
 			return 0;
 		}
 	}
@@ -2756,11 +2759,11 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req)
 	return ret;
 }
 
-static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req)
+static int io_queue_sqe(struct io_kiocb *req)
 {
 	int ret;
 
-	ret = io_req_defer(ctx, req);
+	ret = io_req_defer(req);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
 			io_cqring_add_event(req, ret);
@@ -2769,17 +2772,17 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req)
 		return 0;
 	}
 
-	return __io_queue_sqe(ctx, req);
+	return __io_queue_sqe(req);
 }
 
-static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			      struct io_kiocb *shadow)
+static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 {
 	int ret;
 	int need_submit = false;
+	struct io_ring_ctx *ctx = req->ctx;
 
 	if (!shadow)
-		return io_queue_sqe(ctx, req);
+		return io_queue_sqe(req);
 
 	/*
 	 * Mark the first IO in link list as DRAIN, let all the following
@@ -2787,7 +2790,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * list.
 	 */
 	req->flags |= REQ_F_IO_DRAIN;
-	ret = io_req_defer(ctx, req);
+	ret = io_req_defer(req);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
 			io_cqring_add_event(req, ret);
@@ -2810,18 +2813,19 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	spin_unlock_irq(&ctx->completion_lock);
 
 	if (need_submit)
-		return __io_queue_sqe(ctx, req);
+		return __io_queue_sqe(req);
 
 	return 0;
 }
 
 #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
 
-static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			  struct io_submit_state *state, struct io_kiocb **link)
+static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
+			  struct io_kiocb **link)
 {
 	struct io_uring_sqe *sqe_copy;
 	struct sqe_submit *s = &req->submit;
+	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
 
 	req->user_data = s->sqe->user_data;
@@ -2832,7 +2836,7 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		goto err_req;
 	}
 
-	ret = io_req_set_file(ctx, state, req);
+	ret = io_req_set_file(state, req);
 	if (unlikely(ret)) {
 err_req:
 		io_cqring_add_event(req, ret);
@@ -2869,7 +2873,7 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		ret = -EINVAL;
 		goto err_req;
 	} else {
-		io_queue_sqe(ctx, req);
+		io_queue_sqe(req);
 	}
 }
 
@@ -3018,7 +3022,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		req->submit.needs_fixed_file = async;
 		trace_io_uring_submit_sqe(ctx, req->submit.sqe->user_data,
 					  true, async);
-		io_submit_sqe(ctx, req, statep, &link);
+		io_submit_sqe(req, statep, &link);
 		submitted++;
 
 		/*
@@ -3026,14 +3030,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		 * that's the end of the chain. Submit the previous link.
 		 */
 		if (!(sqe_flags & IOSQE_IO_LINK) && link) {
-			io_queue_link_head(ctx, link, shadow_req);
+			io_queue_link_head(link, shadow_req);
 			link = NULL;
 			shadow_req = NULL;
 		}
 	}
 
 	if (link)
-		io_queue_link_head(ctx, link, shadow_req);
+		io_queue_link_head(link, shadow_req);
 	if (statep)
 		io_submit_state_end(&state);
 

From ec9c02ad4c3808d6d9ed28ad1d0485d6e2a33ac5 Mon Sep 17 00:00:00 2001
From: Jackie Liu <liuyun01@kylinos.cn>
Date: Fri, 8 Nov 2019 23:50:36 +0800
Subject: [PATCH 45/64] io_uring: keep io_put_req only responsible for release
 and put req

We already have io_put_req_find_next to find the next req of the link.
we should not use the io_put_req function to find them. They should be
functions of the same level.

Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 73 ++++++++++++++++++++++++++-------------------------
 1 file changed, 37 insertions(+), 36 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 147d1f0e13cc..1597838d5073 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -373,7 +373,7 @@ struct io_submit_state {
 static void io_wq_submit_work(struct io_wq_work **workptr);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void __io_free_req(struct io_kiocb *req);
-static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr);
+static void io_put_req(struct io_kiocb *req);
 static void io_double_put_req(struct io_kiocb *req);
 
 static struct kmem_cache *req_cachep;
@@ -559,7 +559,7 @@ static void io_kill_timeout(struct io_kiocb *req)
 		atomic_inc(&req->ctx->cq_timeouts);
 		list_del_init(&req->list);
 		io_cqring_fill_event(req, 0);
-		io_put_req(req, NULL);
+		io_put_req(req);
 	}
 }
 
@@ -668,7 +668,7 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	while (!list_empty(&list)) {
 		req = list_first_entry(&list, struct io_kiocb, list);
 		list_del(&req->list);
-		io_put_req(req, NULL);
+		io_put_req(req);
 	}
 }
 
@@ -802,7 +802,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
 		io_cqring_fill_event(req, -ECANCELED);
 		io_commit_cqring(ctx);
 		req->flags &= ~REQ_F_LINK;
-		io_put_req(req, NULL);
+		io_put_req(req);
 		return true;
 	}
 
@@ -921,21 +921,13 @@ static void io_free_req(struct io_kiocb *req, struct io_kiocb **nxt)
  * Drop reference to request, return next in chain (if there is one) if this
  * was the last reference to this request.
  */
-static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
+static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 {
 	struct io_kiocb *nxt = NULL;
 
 	if (refcount_dec_and_test(&req->refs))
 		io_free_req(req, &nxt);
 
-	return nxt;
-}
-
-static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr)
-{
-	struct io_kiocb *nxt;
-
-	nxt = io_put_req_find_next(req);
 	if (nxt) {
 		if (nxtptr)
 			*nxtptr = nxt;
@@ -944,6 +936,12 @@ static void io_put_req(struct io_kiocb *req, struct io_kiocb **nxtptr)
 	}
 }
 
+static void io_put_req(struct io_kiocb *req)
+{
+	if (refcount_dec_and_test(&req->refs))
+		io_free_req(req, NULL);
+}
+
 static void io_double_put_req(struct io_kiocb *req)
 {
 	/* drop both submit and complete references */
@@ -1197,15 +1195,18 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 
 	io_complete_rw_common(kiocb, res);
-	io_put_req(req, NULL);
+	io_put_req(req);
 }
 
 static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+	struct io_kiocb *nxt = NULL;
 
 	io_complete_rw_common(kiocb, res);
-	return io_put_req_find_next(req);
+	io_put_req_find_next(req, &nxt);
+
+	return nxt;
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -1698,7 +1699,7 @@ static int io_nop(struct io_kiocb *req)
 		return -EINVAL;
 
 	io_cqring_add_event(req, 0);
-	io_put_req(req, NULL);
+	io_put_req(req);
 	return 0;
 }
 
@@ -1745,7 +1746,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req, ret);
-	io_put_req(req, nxt);
+	io_put_req_find_next(req, nxt);
 	return 0;
 }
 
@@ -1792,7 +1793,7 @@ static int io_sync_file_range(struct io_kiocb *req,
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req, ret);
-	io_put_req(req, nxt);
+	io_put_req_find_next(req, nxt);
 	return 0;
 }
 
@@ -1830,7 +1831,7 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	io_cqring_add_event(req, ret);
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
-	io_put_req(req, nxt);
+	io_put_req_find_next(req, nxt);
 	return 0;
 }
 #endif
@@ -1884,7 +1885,7 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req, ret);
-	io_put_req(req, nxt);
+	io_put_req_find_next(req, nxt);
 	return 0;
 #else
 	return -EOPNOTSUPP;
@@ -1947,7 +1948,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	io_cqring_add_event(req, ret);
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
-	io_put_req(req, NULL);
+	io_put_req(req);
 	return 0;
 }
 
@@ -1995,7 +1996,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 
 	io_cqring_ev_posted(ctx);
 
-	io_put_req(req, &nxt);
+	io_put_req_find_next(req, &nxt);
 	if (nxt)
 		*workptr = &nxt->work;
 }
@@ -2022,7 +2023,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 		spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 		io_cqring_ev_posted(ctx);
-		io_put_req(req, NULL);
+		io_put_req(req);
 	} else {
 		io_queue_async_work(req);
 	}
@@ -2115,7 +2116,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 	if (mask) {
 		io_cqring_ev_posted(ctx);
-		io_put_req(req, nxt);
+		io_put_req_find_next(req, nxt);
 	}
 	return ipt.error;
 }
@@ -2157,7 +2158,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	io_cqring_ev_posted(ctx);
 	if (req->flags & REQ_F_LINK)
 		req->flags |= REQ_F_FAIL_LINK;
-	io_put_req(req, NULL);
+	io_put_req(req);
 	return HRTIMER_NORESTART;
 }
 
@@ -2200,7 +2201,7 @@ static int io_timeout_remove(struct io_kiocb *req,
 		io_cqring_ev_posted(ctx);
 		if (req->flags & REQ_F_LINK)
 			req->flags |= REQ_F_FAIL_LINK;
-		io_put_req(req, NULL);
+		io_put_req(req);
 		return 0;
 	}
 
@@ -2216,8 +2217,8 @@ static int io_timeout_remove(struct io_kiocb *req,
 	spin_unlock_irq(&ctx->completion_lock);
 	io_cqring_ev_posted(ctx);
 
-	io_put_req(treq, NULL);
-	io_put_req(req, NULL);
+	io_put_req(treq);
+	io_put_req(req);
 	return 0;
 }
 
@@ -2352,7 +2353,7 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req, ret);
-	io_put_req(req, nxt);
+	io_put_req_find_next(req, nxt);
 	return 0;
 }
 
@@ -2498,13 +2499,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 	}
 
 	/* drop submission reference */
-	io_put_req(req, NULL);
+	io_put_req(req);
 
 	if (ret) {
 		if (req->flags & REQ_F_LINK)
 			req->flags |= REQ_F_FAIL_LINK;
 		io_cqring_add_event(req, ret);
-		io_put_req(req, NULL);
+		io_put_req(req);
 	}
 
 	/* async context always use a copy of the sqe */
@@ -2635,7 +2636,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	}
 
 	io_cqring_add_event(req, ret);
-	io_put_req(req, NULL);
+	io_put_req(req);
 	return HRTIMER_NORESTART;
 }
 
@@ -2667,7 +2668,7 @@ static int io_queue_linked_timeout(struct io_kiocb *req, struct io_kiocb *nxt)
 	ret = 0;
 err:
 	/* drop submission reference */
-	io_put_req(nxt, NULL);
+	io_put_req(nxt);
 
 	if (ret) {
 		struct io_ring_ctx *ctx = req->ctx;
@@ -2680,7 +2681,7 @@ static int io_queue_linked_timeout(struct io_kiocb *req, struct io_kiocb *nxt)
 		io_cqring_fill_event(nxt, ret);
 		trace_io_uring_fail_link(req, nxt);
 		io_commit_cqring(ctx);
-		io_put_req(nxt, NULL);
+		io_put_req(nxt);
 		ret = -ECANCELED;
 	}
 
@@ -2746,14 +2747,14 @@ static int __io_queue_sqe(struct io_kiocb *req)
 
 	/* drop submission reference */
 err:
-	io_put_req(req, NULL);
+	io_put_req(req);
 
 	/* and drop final reference, if we failed */
 	if (ret) {
 		io_cqring_add_event(req, ret);
 		if (req->flags & REQ_F_LINK)
 			req->flags |= REQ_F_FAIL_LINK;
-		io_put_req(req, NULL);
+		io_put_req(req);
 	}
 
 	return ret;

From c69f8dbe2426cbf6150407b7e86ce85bb463c1dc Mon Sep 17 00:00:00 2001
From: Jackie Liu <liuyun01@kylinos.cn>
Date: Sat, 9 Nov 2019 11:00:08 +0800
Subject: [PATCH 46/64] io_uring: separate the io_free_req and
 io_free_req_find_next interface

Similar to the distinction between io_put_req and io_put_req_find_next,
io_free_req has been modified similarly, with no functional changes.

Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1597838d5073..2b4257a965db 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -883,7 +883,7 @@ static void io_fail_links(struct io_kiocb *req)
 	io_cqring_ev_posted(ctx);
 }
 
-static void io_free_req(struct io_kiocb *req, struct io_kiocb **nxt)
+static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 {
 	if (likely(!(req->flags & REQ_F_LINK))) {
 		__io_free_req(req);
@@ -917,6 +917,11 @@ static void io_free_req(struct io_kiocb *req, struct io_kiocb **nxt)
 	__io_free_req(req);
 }
 
+static void io_free_req(struct io_kiocb *req)
+{
+	io_free_req_find_next(req, NULL);
+}
+
 /*
  * Drop reference to request, return next in chain (if there is one) if this
  * was the last reference to this request.
@@ -926,7 +931,7 @@ static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 	struct io_kiocb *nxt = NULL;
 
 	if (refcount_dec_and_test(&req->refs))
-		io_free_req(req, &nxt);
+		io_free_req_find_next(req, &nxt);
 
 	if (nxt) {
 		if (nxtptr)
@@ -939,7 +944,7 @@ static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 static void io_put_req(struct io_kiocb *req)
 {
 	if (refcount_dec_and_test(&req->refs))
-		io_free_req(req, NULL);
+		io_free_req(req);
 }
 
 static void io_double_put_req(struct io_kiocb *req)
@@ -1006,7 +1011,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 				if (to_free == ARRAY_SIZE(reqs))
 					io_free_req_many(ctx, reqs, &to_free);
 			} else {
-				io_free_req(req, NULL);
+				io_free_req(req);
 			}
 		}
 	}

From 46568e9be70ff8211d986685f08d919376c32998 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 10 Nov 2019 08:40:53 -0700
Subject: [PATCH 47/64] io_uring: fix error clear of ->file_table in
 io_sqe_files_register()

syzbot reports that when using failslab and friends, we can get a double
free in io_sqe_files_unregister():

BUG: KASAN: double-free or invalid-free in
io_sqe_files_unregister+0x20b/0x300 fs/io_uring.c:3185

CPU: 1 PID: 8819 Comm: syz-executor452 Not tainted 5.4.0-rc6-next-20191108
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x197/0x210 lib/dump_stack.c:118
  print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374
  kasan_report_invalid_free+0x65/0xa0 mm/kasan/report.c:468
  __kasan_slab_free+0x13a/0x150 mm/kasan/common.c:450
  kasan_slab_free+0xe/0x10 mm/kasan/common.c:480
  __cache_free mm/slab.c:3426 [inline]
  kfree+0x10a/0x2c0 mm/slab.c:3757
  io_sqe_files_unregister+0x20b/0x300 fs/io_uring.c:3185
  io_ring_ctx_free fs/io_uring.c:3998 [inline]
  io_ring_ctx_wait_and_kill+0x348/0x700 fs/io_uring.c:4060
  io_uring_release+0x42/0x50 fs/io_uring.c:4068
  __fput+0x2ff/0x890 fs/file_table.c:280
  ____fput+0x16/0x20 fs/file_table.c:313
  task_work_run+0x145/0x1c0 kernel/task_work.c:113
  exit_task_work include/linux/task_work.h:22 [inline]
  do_exit+0x904/0x2e60 kernel/exit.c:817
  do_group_exit+0x135/0x360 kernel/exit.c:921
  __do_sys_exit_group kernel/exit.c:932 [inline]
  __se_sys_exit_group kernel/exit.c:930 [inline]
  __x64_sys_exit_group+0x44/0x50 kernel/exit.c:930
  do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x43f2c8
Code: 31 b8 c5 f7 ff ff 48 8b 5c 24 28 48 8b 6c 24 30 4c 8b 64 24 38 4c 8b
6c 24 40 4c 8b 74 24 48 4c 8b 7c 24 50 48 83 c4 58 c3 66 <0f> 1f 84 00 00
00 00 00 48 8d 35 59 ca 00 00 0f b6 d2 48 89 fb 48
RSP: 002b:00007ffd5b976008 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 000000000043f2c8
RDX: 0000000000000000 RSI: 000000000000003c RDI: 0000000000000000
RBP: 00000000004bf0a8 R08: 00000000000000e7 R09: ffffffffffffffd0
R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001
R13: 00000000006d1180 R14: 0000000000000000 R15: 0000000000000000

This happens if we fail allocating the file tables. For that case we do
free the file table correctly, but we forget to set it to NULL. This
means that ring teardown will see it as being non-NULL, and attempt to
free it again.

Fix this by clearing the file_table pointer if we free the table.

Reported-by: syzbot+3254bc44113ae1e331ee@syzkaller.appspotmail.com
Fixes: 65e19f54d29c ("io_uring: support for larger fixed file sets")
Reviewed-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2b4257a965db..737c311c6da5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3488,6 +3488,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 
 	if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
 		kfree(ctx->file_table);
+		ctx->file_table = NULL;
 		return -ENOMEM;
 	}
 

From 8e3cca12706231daf8daf90dbde59f1665135e48 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 9 Nov 2019 19:52:33 -0700
Subject: [PATCH 48/64] io_uring: convert accept4() -ERESTARTSYS into -EINTR

If we cancel a pending accept operating with a signal, we get
-ERESTARTSYS returned. Turn that into -EINTR for userspace, we should
not be return -ERESTARTSYS.

Fixes: 17f2fe35d080 ("io_uring: add support for IORING_OP_ACCEPT")
Reported-by: Hrvoje Zeba <zeba.hrvoje@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 737c311c6da5..bbb3889dae41 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1887,6 +1887,8 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
 		return -EAGAIN;
 	}
+	if (ret == -ERESTARTSYS)
+		ret = -EINTR;
 	if (ret < 0 && (req->flags & REQ_F_LINK))
 		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req, ret);

From 0ddf92e848ab7abf216f218ee363eb9b9650e98f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 8 Nov 2019 08:52:53 -0700
Subject: [PATCH 49/64] io_uring: provide fallback request for OOM situations

One thing that really sucks for userspace APIs is if the kernel passes
back -ENOMEM/-EAGAIN for resource shortages. The application really has
no idea of what to do in those cases. Should it try and reap
completions? Probably a good idea. Will it solve the issue? Who knows.

This patch adds a simple fallback mechanism if we fail to allocate
memory for a request. If we fail allocating memory from the slab for a
request, we punt to a pre-allocated request. There's just one of these
per io_ring_ctx, but the important part is if we ever return -EBUSY to
the application, the applications knows that it can wait for events and
make forward progress when events have completed. This is the important
part.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 46 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bbb3889dae41..2c838baf11b4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -239,6 +239,9 @@ struct io_ring_ctx {
 	/* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
 	struct completion	*completions;
 
+	/* if all else fails... */
+	struct io_kiocb		*fallback_req;
+
 #if defined(CONFIG_UNIX)
 	struct socket		*ring_sock;
 #endif
@@ -408,6 +411,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	if (!ctx)
 		return NULL;
 
+	ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
+	if (!ctx->fallback_req)
+		goto err;
+
 	ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
 	if (!ctx->completions)
 		goto err;
@@ -433,6 +440,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->inflight_list);
 	return ctx;
 err:
+	if (ctx->fallback_req)
+		kmem_cache_free(req_cachep, ctx->fallback_req);
 	kfree(ctx->completions);
 	kfree(ctx);
 	return NULL;
@@ -712,6 +721,23 @@ static void io_cqring_add_event(struct io_kiocb *req, long res)
 	io_cqring_ev_posted(ctx);
 }
 
+static inline bool io_is_fallback_req(struct io_kiocb *req)
+{
+	return req == (struct io_kiocb *)
+			((unsigned long) req->ctx->fallback_req & ~1UL);
+}
+
+static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	req = ctx->fallback_req;
+	if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
+		return req;
+
+	return NULL;
+}
+
 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 				   struct io_submit_state *state)
 {
@@ -724,7 +750,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 	if (!state) {
 		req = kmem_cache_alloc(req_cachep, gfp);
 		if (unlikely(!req))
-			goto out;
+			goto fallback;
 	} else if (!state->free_reqs) {
 		size_t sz;
 		int ret;
@@ -739,7 +765,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 		if (unlikely(ret <= 0)) {
 			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
 			if (!state->reqs[0])
-				goto out;
+				goto fallback;
 			ret = 1;
 		}
 		state->free_reqs = ret - 1;
@@ -751,6 +777,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 		state->cur_req++;
 	}
 
+got_it:
 	req->file = NULL;
 	req->ctx = ctx;
 	req->flags = 0;
@@ -759,7 +786,10 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 	req->result = 0;
 	INIT_IO_WORK(&req->work, io_wq_submit_work);
 	return req;
-out:
+fallback:
+	req = io_get_fallback_req(ctx);
+	if (req)
+		goto got_it;
 	percpu_ref_put(&ctx->refs);
 	return NULL;
 }
@@ -789,7 +819,10 @@ static void __io_free_req(struct io_kiocb *req)
 		spin_unlock_irqrestore(&ctx->inflight_lock, flags);
 	}
 	percpu_ref_put(&ctx->refs);
-	kmem_cache_free(req_cachep, req);
+	if (likely(!io_is_fallback_req(req)))
+		kmem_cache_free(req_cachep, req);
+	else
+		clear_bit_unlock(0, (unsigned long *) ctx->fallback_req);
 }
 
 static bool io_link_cancel_timeout(struct io_kiocb *req)
@@ -1005,8 +1038,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			 * completions for those, only batch free for fixed
 			 * file and non-linked commands.
 			 */
-			if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
-			    REQ_F_FIXED_FILE) {
+			if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+			    REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) {
 				reqs[to_free++] = req;
 				if (to_free == ARRAY_SIZE(reqs))
 					io_free_req_many(ctx, reqs, &to_free);
@@ -4128,6 +4161,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 				ring_pages(ctx->sq_entries, ctx->cq_entries));
 	free_uid(ctx->user);
 	kfree(ctx->completions);
+	kmem_cache_free(req_cachep, ctx->fallback_req);
 	kfree(ctx);
 }
 

From 47f467686ec02fc07fd5c6bb34b6f6736e2884b0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 9 Nov 2019 17:43:02 -0700
Subject: [PATCH 50/64] io_uring: make ASYNC_CANCEL work with poll and timeout

It's a little confusing that we have multiple types of command
cancellation opcodes now that we have a generic one. Make the generic
one work with POLL_ADD and TIMEOUT commands as well, that makes for an
easier to use API for the application. The fact that they currently
don't is a bit confusing.

Add a helper that takes care of it, so we can user it from both
IORING_OP_ASYNC_CANCEL and from the linked timeout cancellation.

Reported-by: Hrvoje Zeba <zeba.hrvoje@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 138 +++++++++++++++++++++++++++++---------------------
 1 file changed, 80 insertions(+), 58 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2c838baf11b4..b70982502336 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1959,6 +1959,20 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
 	spin_unlock_irq(&ctx->completion_lock);
 }
 
+static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
+{
+	struct io_kiocb *req;
+
+	list_for_each_entry(req, &ctx->cancel_list, list) {
+		if (req->user_data != sqe_addr)
+			continue;
+		io_poll_remove_one(req);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
 /*
  * Find a running poll command that matches one specified in sqe->addr,
  * and remove it if found.
@@ -1966,8 +1980,7 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
 static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_kiocb *poll_req, *next;
-	int ret = -ENOENT;
+	int ret;
 
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
@@ -1976,13 +1989,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EINVAL;
 
 	spin_lock_irq(&ctx->completion_lock);
-	list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
-		if (READ_ONCE(sqe->addr) == poll_req->user_data) {
-			io_poll_remove_one(poll_req);
-			ret = 0;
-			break;
-		}
-	}
+	ret = io_poll_cancel(ctx, READ_ONCE(sqe->addr));
 	spin_unlock_irq(&ctx->completion_lock);
 
 	io_cqring_add_event(req, ret);
@@ -2202,6 +2209,31 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+{
+	struct io_kiocb *req;
+	int ret = -ENOENT;
+
+	list_for_each_entry(req, &ctx->timeout_list, list) {
+		if (user_data == req->user_data) {
+			list_del_init(&req->list);
+			ret = 0;
+			break;
+		}
+	}
+
+	if (ret == -ENOENT)
+		return ret;
+
+	ret = hrtimer_try_to_cancel(&req->timeout.timer);
+	if (ret == -1)
+		return -EALREADY;
+
+	io_cqring_fill_event(req, -ECANCELED);
+	io_put_req(req);
+	return 0;
+}
+
 /*
  * Remove or update an existing timeout command
  */
@@ -2209,10 +2241,8 @@ static int io_timeout_remove(struct io_kiocb *req,
 			     const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_kiocb *treq;
-	int ret = -ENOENT;
-	__u64 user_data;
 	unsigned flags;
+	int ret;
 
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
@@ -2222,42 +2252,15 @@ static int io_timeout_remove(struct io_kiocb *req,
 	if (flags)
 		return -EINVAL;
 
-	user_data = READ_ONCE(sqe->addr);
 	spin_lock_irq(&ctx->completion_lock);
-	list_for_each_entry(treq, &ctx->timeout_list, list) {
-		if (user_data == treq->user_data) {
-			list_del_init(&treq->list);
-			ret = 0;
-			break;
-		}
-	}
+	ret = io_timeout_cancel(ctx, READ_ONCE(sqe->addr));
 
-	/* didn't find timeout */
-	if (ret) {
-fill_ev:
-		io_cqring_fill_event(req, ret);
-		io_commit_cqring(ctx);
-		spin_unlock_irq(&ctx->completion_lock);
-		io_cqring_ev_posted(ctx);
-		if (req->flags & REQ_F_LINK)
-			req->flags |= REQ_F_FAIL_LINK;
-		io_put_req(req);
-		return 0;
-	}
-
-	ret = hrtimer_try_to_cancel(&treq->timeout.timer);
-	if (ret == -1) {
-		ret = -EBUSY;
-		goto fill_ev;
-	}
-
-	io_cqring_fill_event(req, 0);
-	io_cqring_fill_event(treq, -ECANCELED);
+	io_cqring_fill_event(req, ret);
 	io_commit_cqring(ctx);
 	spin_unlock_irq(&ctx->completion_lock);
 	io_cqring_ev_posted(ctx);
-
-	io_put_req(treq);
+	if (ret < 0 && req->flags & REQ_F_LINK)
+		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req(req);
 	return 0;
 }
@@ -2374,12 +2377,39 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
 	return ret;
 }
 
+static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
+				     struct io_kiocb *req, __u64 sqe_addr,
+				     struct io_kiocb **nxt)
+{
+	unsigned long flags;
+	int ret;
+
+	ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
+	if (ret != -ENOENT) {
+		spin_lock_irqsave(&ctx->completion_lock, flags);
+		goto done;
+	}
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	ret = io_timeout_cancel(ctx, sqe_addr);
+	if (ret != -ENOENT)
+		goto done;
+	ret = io_poll_cancel(ctx, sqe_addr);
+done:
+	io_cqring_fill_event(req, ret);
+	io_commit_cqring(ctx);
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	io_cqring_ev_posted(ctx);
+
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
+	io_put_req_find_next(req, nxt);
+}
+
 static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			   struct io_kiocb **nxt)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	void *sqe_addr;
-	int ret;
 
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
@@ -2387,13 +2417,7 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	    sqe->cancel_flags)
 		return -EINVAL;
 
-	sqe_addr = (void *) (unsigned long) READ_ONCE(sqe->addr);
-	ret = io_async_cancel_one(ctx, sqe_addr);
-
-	if (ret < 0 && (req->flags & REQ_F_LINK))
-		req->flags |= REQ_F_FAIL_LINK;
-	io_cqring_add_event(req, ret);
-	io_put_req_find_next(req, nxt);
+	io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), NULL);
 	return 0;
 }
 
@@ -2655,7 +2679,6 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_kiocb *prev = NULL;
 	unsigned long flags;
-	int ret = -ETIME;
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
 
@@ -2671,12 +2694,11 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	if (prev) {
-		void *user_data = (void *) (unsigned long) prev->user_data;
-		ret = io_async_cancel_one(ctx, user_data);
+		io_async_find_and_cancel(ctx, req, prev->user_data, NULL);
+	} else {
+		io_cqring_add_event(req, -ETIME);
+		io_put_req(req);
 	}
-
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
 	return HRTIMER_NORESTART;
 }
 

From c1edbf5f081be9fbbea68c1d564b773e59c1acf3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 10 Nov 2019 16:56:04 -0700
Subject: [PATCH 51/64] io_uring: flag SQPOLL busy condition to userspace

Now that we have backpressure, for SQPOLL, we have one more condition
that warrants flagging that the application needs to enter the kernel:
we failed to submit IO due to backpressure. Make sure we catch that
and flag it appropriately.

If we run into backpressure issues with the SQPOLL thread, flag it
as such to the application by setting IORING_SQ_NEED_WAKEUP. This will
cause the application to enter the kernel, and that will flush the
backlog and clear the condition.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b70982502336..912d2648f8db 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3118,16 +3118,16 @@ static int io_sq_thread(void *data)
 	DEFINE_WAIT(wait);
 	unsigned inflight;
 	unsigned long timeout;
+	int ret;
 
 	complete(&ctx->completions[1]);
 
 	old_fs = get_fs();
 	set_fs(USER_DS);
 
-	timeout = inflight = 0;
+	ret = timeout = inflight = 0;
 	while (!kthread_should_park()) {
 		unsigned int to_submit;
-		int ret;
 
 		if (inflight) {
 			unsigned nr_events = 0;
@@ -3161,13 +3161,21 @@ static int io_sq_thread(void *data)
 		}
 
 		to_submit = io_sqring_entries(ctx);
-		if (!to_submit) {
+
+		/*
+		 * If submit got -EBUSY, flag us as needing the application
+		 * to enter the kernel to reap and flush events.
+		 */
+		if (!to_submit || ret == -EBUSY) {
 			/*
 			 * We're polling. If we're within the defined idle
 			 * period, then let us spin without work before going
-			 * to sleep.
+			 * to sleep. The exception is if we got EBUSY doing
+			 * more IO, we should wait for the application to
+			 * reap events and wake us up.
 			 */
-			if (inflight || !time_after(jiffies, timeout)) {
+			if (inflight ||
+			    (!time_after(jiffies, timeout) && ret != -EBUSY)) {
 				cond_resched();
 				continue;
 			}
@@ -3193,7 +3201,7 @@ static int io_sq_thread(void *data)
 			smp_mb();
 
 			to_submit = io_sqring_entries(ctx);
-			if (!to_submit) {
+			if (!to_submit || ret == -EBUSY) {
 				if (kthread_should_park()) {
 					finish_wait(&ctx->sqo_wait, &wait);
 					break;
@@ -4351,6 +4359,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	 */
 	ret = 0;
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		if (!list_empty_careful(&ctx->cq_overflow_list))
+			io_cqring_overflow_flush(ctx, false);
 		if (flags & IORING_ENTER_SQ_WAKEUP)
 			wake_up(&ctx->sqo_wait);
 		submitted = to_submit;

From 768134d4f48109b90f4248feecbeeb7d684e410c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 10 Nov 2019 20:30:53 -0700
Subject: [PATCH 52/64] io_uring: don't do flush cancel under inflight_lock

We can't safely cancel under the inflight lock. If the work hasn't been
started yet, then io_wq_cancel_work() simply marks the work as cancelled
and invokes the work handler. But if the work completion needs to grab
the inflight lock because it's grabbing user files, then we'll deadlock
trying to finish the work as we already hold that lock.

Instead grab a reference to the request, if it isn't already zero. If
it's zero, then we know it's going through completion anyway, and we
can safely ignore it. If it's not zero, then we can drop the lock and
attempt to cancel from there.

This also fixes a missing finish_wait() at the end of
io_uring_cancel_files().

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 912d2648f8db..dcb0602c9fd2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4257,33 +4257,34 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 
 	while (!list_empty_careful(&ctx->inflight_list)) {
 		enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
+		struct io_kiocb *cancel_req = NULL;
 
 		spin_lock_irq(&ctx->inflight_lock);
 		list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
-			if (req->work.files == files) {
-				ret = io_wq_cancel_work(ctx->io_wq, &req->work);
-				break;
-			}
+			if (req->work.files != files)
+				continue;
+			/* req is being completed, ignore */
+			if (!refcount_inc_not_zero(&req->refs))
+				continue;
+			cancel_req = req;
+			break;
 		}
-		if (ret == IO_WQ_CANCEL_RUNNING)
+		if (cancel_req)
 			prepare_to_wait(&ctx->inflight_wait, &wait,
-					TASK_UNINTERRUPTIBLE);
-
+						TASK_UNINTERRUPTIBLE);
 		spin_unlock_irq(&ctx->inflight_lock);
 
-		/*
-		 * We need to keep going until we get NOTFOUND. We only cancel
-		 * one work at the time.
-		 *
-		 * If we get CANCEL_RUNNING, then wait for a work to complete
-		 * before continuing.
-		 */
-		if (ret == IO_WQ_CANCEL_OK)
-			continue;
-		else if (ret != IO_WQ_CANCEL_RUNNING)
+		if (cancel_req) {
+			ret = io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
+			io_put_req(cancel_req);
+		}
+
+		/* We need to keep going until we don't find a matching req */
+		if (!cancel_req)
 			break;
 		schedule();
 	}
+	finish_wait(&ctx->inflight_wait, &wait);
 }
 
 static int io_uring_flush(struct file *file, void *data)

From 76a46e066e2d93bd333599d1c84c605c2c4cc909 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 10 Nov 2019 23:34:16 -0700
Subject: [PATCH 53/64] io_uring: fix -ENOENT issue with linked timer with
 short timeout

If you prep a read (for example) that needs to get punted to async
context with a timer, if the timeout is sufficiently short, the timer
request will get completed with -ENOENT as it could not find the read.

The issue is that we prep and start the timer before we start the read.
Hence the timer can trigger before the read is even started, and the end
result is then that the timer completes with -ENOENT, while the read
starts instead of being cancelled by the timer.

Fix this by splitting the linked timer into two parts:

1) Prep and validate the linked timer
2) Start timer

The read is then started between steps 1 and 2, so we know that the
timer will always have a consistent view of the read request state.

Reported-by: Hrvoje Zeba <zeba.hrvoje@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 127 +++++++++++++++++++++++++++++---------------------
 1 file changed, 74 insertions(+), 53 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index dcb0602c9fd2..ad7f569319c2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -855,7 +855,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 	 */
 	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
 	while (nxt) {
-		list_del(&nxt->list);
+		list_del_init(&nxt->list);
 		if (!list_empty(&req->link_list)) {
 			INIT_LIST_HEAD(&nxt->link_list);
 			list_splice(&req->link_list, &nxt->link_list);
@@ -2688,13 +2688,17 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	 */
 	if (!list_empty(&req->list)) {
 		prev = list_entry(req->list.prev, struct io_kiocb, link_list);
-		list_del_init(&req->list);
+		if (refcount_inc_not_zero(&prev->refs))
+			list_del_init(&req->list);
+		else
+			prev = NULL;
 	}
 
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	if (prev) {
 		io_async_find_and_cancel(ctx, req, prev->user_data, NULL);
+		io_put_req(prev);
 	} else {
 		io_cqring_add_event(req, -ETIME);
 		io_put_req(req);
@@ -2702,78 +2706,84 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-static int io_queue_linked_timeout(struct io_kiocb *req, struct io_kiocb *nxt)
+static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
+				    enum hrtimer_mode *mode)
 {
-	const struct io_uring_sqe *sqe = nxt->submit.sqe;
-	enum hrtimer_mode mode;
-	struct timespec64 ts;
-	int ret = -EINVAL;
+	struct io_ring_ctx *ctx = req->ctx;
 
-	if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off)
-		goto err;
-	if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS)
-		goto err;
-	if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) {
-		ret = -EFAULT;
-		goto err;
+	/*
+	 * If the list is now empty, then our linked request finished before
+	 * we got a chance to setup the timer
+	 */
+	spin_lock_irq(&ctx->completion_lock);
+	if (!list_empty(&req->list)) {
+		req->timeout.timer.function = io_link_timeout_fn;
+		hrtimer_start(&req->timeout.timer, timespec64_to_ktime(*ts),
+				*mode);
 	}
+	spin_unlock_irq(&ctx->completion_lock);
 
-	req->flags |= REQ_F_LINK_TIMEOUT;
-
-	if (sqe->timeout_flags & IORING_TIMEOUT_ABS)
-		mode = HRTIMER_MODE_ABS;
-	else
-		mode = HRTIMER_MODE_REL;
-	hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, mode);
-	nxt->timeout.timer.function = io_link_timeout_fn;
-	hrtimer_start(&nxt->timeout.timer, timespec64_to_ktime(ts), mode);
-	ret = 0;
-err:
 	/* drop submission reference */
-	io_put_req(nxt);
-
-	if (ret) {
-		struct io_ring_ctx *ctx = req->ctx;
-
-		/*
-		 * Break the link and fail linked timeout, parent will get
-		 * failed by the regular submission path.
-		 */
-		list_del(&nxt->list);
-		io_cqring_fill_event(nxt, ret);
-		trace_io_uring_fail_link(req, nxt);
-		io_commit_cqring(ctx);
-		io_put_req(nxt);
-		ret = -ECANCELED;
-	}
-
-	return ret;
+	io_put_req(req);
 }
 
-static inline struct io_kiocb *io_get_linked_timeout(struct io_kiocb *req)
+static int io_validate_link_timeout(const struct io_uring_sqe *sqe,
+				    struct timespec64 *ts)
+{
+	if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off)
+		return -EINVAL;
+	if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS)
+		return -EINVAL;
+	if (get_timespec64(ts, u64_to_user_ptr(sqe->addr)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req,
+					       struct timespec64 *ts,
+					       enum hrtimer_mode *mode)
 {
 	struct io_kiocb *nxt;
+	int ret;
 
 	if (!(req->flags & REQ_F_LINK))
 		return NULL;
 
 	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
-	if (nxt && nxt->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT)
-		return nxt;
+	if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT)
+		return NULL;
 
-	return NULL;
+	ret = io_validate_link_timeout(nxt->submit.sqe, ts);
+	if (ret) {
+		list_del_init(&nxt->list);
+		io_cqring_add_event(nxt, ret);
+		io_double_put_req(nxt);
+		return ERR_PTR(-ECANCELED);
+	}
+
+	if (nxt->submit.sqe->timeout_flags & IORING_TIMEOUT_ABS)
+		*mode = HRTIMER_MODE_ABS;
+	else
+		*mode = HRTIMER_MODE_REL;
+
+	req->flags |= REQ_F_LINK_TIMEOUT;
+	hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, *mode);
+	return nxt;
 }
 
 static int __io_queue_sqe(struct io_kiocb *req)
 {
+	enum hrtimer_mode mode;
 	struct io_kiocb *nxt;
+	struct timespec64 ts;
 	int ret;
 
-	nxt = io_get_linked_timeout(req);
-	if (unlikely(nxt)) {
-		ret = io_queue_linked_timeout(req, nxt);
-		if (ret)
-			goto err;
+	nxt = io_prep_linked_timeout(req, &ts, &mode);
+	if (IS_ERR(nxt)) {
+		ret = PTR_ERR(nxt);
+		nxt = NULL;
+		goto err;
 	}
 
 	ret = __io_submit_sqe(req, NULL, true);
@@ -2803,14 +2813,25 @@ static int __io_queue_sqe(struct io_kiocb *req)
 			 * submit reference when the iocb is actually submitted.
 			 */
 			io_queue_async_work(req);
+
+			if (nxt)
+				io_queue_linked_timeout(nxt, &ts, &mode);
+
 			return 0;
 		}
 	}
 
-	/* drop submission reference */
 err:
+	/* drop submission reference */
 	io_put_req(req);
 
+	if (nxt) {
+		if (!ret)
+			io_queue_linked_timeout(nxt, &ts, &mode);
+		else
+			io_put_req(nxt);
+	}
+
 	/* and drop final reference, if we failed */
 	if (ret) {
 		io_cqring_add_event(req, ret);

From 960e432dfa5927892a9b170d14de874597b84849 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 12 Nov 2019 07:56:39 -0700
Subject: [PATCH 54/64] io_uring: use correct "is IO worker" helper

Since we switched to io-wq, the dependent link optimization for when to
pass back work inline has been broken. Fix this by providing a suitable
io-wq helper for io_uring to use to detect when to do this.

Fixes: 561fb04a6a22 ("io_uring: replace workqueue usage with io-wq")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.h    | 4 ++++
 fs/io_uring.c | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/io-wq.h b/fs/io-wq.h
index 8cb345256f35..cc50754d028c 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -62,4 +62,8 @@ static inline void io_wq_worker_running(struct task_struct *tsk)
 }
 #endif
 
+static inline bool io_wq_current_is_worker(void)
+{
+	return in_task() && (current->flags & PF_IO_WORKER);
+}
 #endif
diff --git a/fs/io_uring.c b/fs/io_uring.c
index ad7f569319c2..3c573f0578a8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -872,7 +872,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 			/* we dropped this link, get next */
 			nxt = list_first_entry_or_null(&req->link_list,
 							struct io_kiocb, list);
-		} else if (nxtptr && current_work()) {
+		} else if (nxtptr && io_wq_current_is_worker()) {
 			*nxtptr = nxt;
 			break;
 		} else {

From 7c9e7f0fe0d8abf856a957c150c48778e75154c1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 12 Nov 2019 08:15:53 -0700
Subject: [PATCH 55/64] io_uring: fix potential deadlock in io_poll_wake()

We attempt to run the poll completion inline, but we're using trylock to
do so. This avoids a deadlock since we're grabbing the locks in reverse
order at this point, we already hold the poll wq lock and we're trying
to grab the completion lock, while the normal rules are the reverse of
that order.

IO completion for a timeout link will need to grab the completion lock,
but that's not safe from this context. Put the completion under the
completion_lock in io_poll_wake(), and mark the request as entering
the completion with the completion_lock already held.

Fixes: 2665abfd757f ("io_uring: add support for linked SQE timeouts")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3c573f0578a8..247e5e1137a3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -341,6 +341,7 @@ struct io_kiocb {
 #define REQ_F_ISREG		2048	/* regular file */
 #define REQ_F_MUST_PUNT		4096	/* must be punted even for NONBLOCK */
 #define REQ_F_INFLIGHT		8192	/* on inflight list */
+#define REQ_F_COMP_LOCKED	16384	/* completion under lock */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -931,14 +932,15 @@ static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 	 */
 	if (req->flags & REQ_F_FAIL_LINK) {
 		io_fail_links(req);
-	} else if (req->flags & REQ_F_LINK_TIMEOUT) {
+	} else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
+			REQ_F_LINK_TIMEOUT) {
 		struct io_ring_ctx *ctx = req->ctx;
 		unsigned long flags;
 
 		/*
 		 * If this is a timeout link, we could be racing with the
 		 * timeout timer. Grab the completion lock for this case to
-		 * protection against that.
+		 * protect against that.
 		 */
 		spin_lock_irqsave(&ctx->completion_lock, flags);
 		io_req_link_next(req, nxt);
@@ -2064,13 +2066,20 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 
 	list_del_init(&poll->wait.entry);
 
+	/*
+	 * Run completion inline if we can. We're using trylock here because
+	 * we are violating the completion_lock -> poll wq lock ordering.
+	 * If we have a link timeout we're going to need the completion_lock
+	 * for finalizing the request, mark us as having grabbed that already.
+	 */
 	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
 		list_del(&req->list);
 		io_poll_complete(req, mask);
+		req->flags |= REQ_F_COMP_LOCKED;
+		io_put_req(req);
 		spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 		io_cqring_ev_posted(ctx);
-		io_put_req(req);
 	} else {
 		io_queue_async_work(req);
 	}

From 15dff286d0e0087d4dcd7049911f179e4e4cfd94 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 13 Nov 2019 09:09:23 -0700
Subject: [PATCH 56/64] io_uring: check for validity of ->rings in teardown

Normally the rings are always valid, the exception is if we failed to
allocate the rings at setup time. syzbot reports this:

RSP: 002b:00007ffd6e8aa078 EFLAGS: 00000246 ORIG_RAX: 00000000000001a9
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000441229
RDX: 0000000000000002 RSI: 0000000020000140 RDI: 0000000000000d0d
RBP: 00007ffd6e8aa090 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000000
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 8903 Comm: syz-executor410 Not tainted 5.4.0-rc7-next-20191113
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
RIP: 0010:__read_once_size include/linux/compiler.h:199 [inline]
RIP: 0010:__io_commit_cqring fs/io_uring.c:496 [inline]
RIP: 0010:io_commit_cqring+0x1e1/0xdb0 fs/io_uring.c:592
Code: 03 0f 8e df 09 00 00 48 8b 45 d0 4c 8d a3 c0 00 00 00 4c 89 e2 48 c1
ea 03 44 8b b8 c0 01 00 00 48 b8 00 00 00 00 00 fc ff df <0f> b6 14 02 4c
89 e0 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 61
RSP: 0018:ffff88808f51fc08 EFLAGS: 00010006
RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff815abe4a
RDX: 0000000000000018 RSI: ffffffff81d168d5 RDI: ffff8880a9166100
RBP: ffff88808f51fc70 R08: 0000000000000004 R09: ffffed1011ea3f7d
R10: ffffed1011ea3f7c R11: 0000000000000003 R12: 00000000000000c0
R13: ffff8880a91661c0 R14: 1ffff1101522cc10 R15: 0000000000000000
FS:  0000000001e7a880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000140 CR3: 000000009a74c000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  io_cqring_overflow_flush+0x6b9/0xa90 fs/io_uring.c:673
  io_ring_ctx_wait_and_kill+0x24f/0x7c0 fs/io_uring.c:4260
  io_uring_create fs/io_uring.c:4600 [inline]
  io_uring_setup+0x1256/0x1cc0 fs/io_uring.c:4626
  __do_sys_io_uring_setup fs/io_uring.c:4639 [inline]
  __se_sys_io_uring_setup fs/io_uring.c:4636 [inline]
  __x64_sys_io_uring_setup+0x54/0x80 fs/io_uring.c:4636
  do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x441229
Code: e8 5c ae 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
ff 0f 83 bb 0a fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007ffd6e8aa078 EFLAGS: 00000246 ORIG_RAX: 00000000000001a9
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000441229
RDX: 0000000000000002 RSI: 0000000020000140 RDI: 0000000000000d0d
RBP: 00007ffd6e8aa090 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000000
Modules linked in:
---[ end trace b0f5b127a57f623f ]---
RIP: 0010:__read_once_size include/linux/compiler.h:199 [inline]
RIP: 0010:__io_commit_cqring fs/io_uring.c:496 [inline]
RIP: 0010:io_commit_cqring+0x1e1/0xdb0 fs/io_uring.c:592
Code: 03 0f 8e df 09 00 00 48 8b 45 d0 4c 8d a3 c0 00 00 00 4c 89 e2 48 c1
ea 03 44 8b b8 c0 01 00 00 48 b8 00 00 00 00 00 fc ff df <0f> b6 14 02 4c
89 e0 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 61
RSP: 0018:ffff88808f51fc08 EFLAGS: 00010006
RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff815abe4a
RDX: 0000000000000018 RSI: ffffffff81d168d5 RDI: ffff8880a9166100
RBP: ffff88808f51fc70 R08: 0000000000000004 R09: ffffed1011ea3f7d
R10: ffffed1011ea3f7c R11: 0000000000000003 R12: 00000000000000c0
R13: ffff8880a91661c0 R14: 1ffff1101522cc10 R15: 0000000000000000
FS:  0000000001e7a880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000140 CR3: 000000009a74c000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400

which is exactly the case of failing to allocate the SQ/CQ rings, and
then entering shutdown. Check if the rings are valid before trying to
access them at shutdown time.

Reported-by: syzbot+21147d79607d724bd6f3@syzkaller.appspotmail.com
Fixes: 1d7bb1d50fb4 ("io_uring: add support for backlogged CQ ring")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 247e5e1137a3..99822bf89924 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4265,7 +4265,9 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 		io_wq_cancel_all(ctx->io_wq);
 
 	io_iopoll_reap_events(ctx);
-	io_cqring_overflow_flush(ctx, true);
+	/* if we failed setting up the ctx, we might not have any rings */
+	if (ctx->rings)
+		io_cqring_overflow_flush(ctx, true);
 	wait_for_completion(&ctx->completions[0]);
 	io_ring_ctx_free(ctx);
 }

From 7d7230652e7c788ef908536fd79f4cca077f269f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 12 Nov 2019 22:31:31 -0700
Subject: [PATCH 57/64] io_wq: add get/put_work handlers to io_wq_create()

For cancellation, we need to ensure that the work item stays valid for
as long as ->cur_work is valid. Right now we can't safely dereference
the work item even under the wqe->lock, because while the ->cur_work
pointer will remain valid, the work could be completing and be freed
in parallel.

Only invoke ->get/put_work() on items we know that the caller queued
themselves. Add IO_WQ_WORK_INTERNAL for io-wq to use, which is needed
when we're queueing a flush item, for instance.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    | 25 +++++++++++++++++++++++--
 fs/io-wq.h    |  7 ++++++-
 fs/io_uring.c | 17 ++++++++++++++++-
 3 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 33b14b85752b..26d81540c1fc 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -106,6 +106,9 @@ struct io_wq {
 	unsigned long state;
 	unsigned nr_wqes;
 
+	get_work_fn *get_work;
+	put_work_fn *put_work;
+
 	struct task_struct *manager;
 	struct user_struct *user;
 	struct mm_struct *mm;
@@ -392,7 +395,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
 static void io_worker_handle_work(struct io_worker *worker)
 	__releases(wqe->lock)
 {
-	struct io_wq_work *work, *old_work;
+	struct io_wq_work *work, *old_work = NULL, *put_work = NULL;
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wq *wq = wqe->wq;
 
@@ -424,6 +427,8 @@ static void io_worker_handle_work(struct io_worker *worker)
 			wqe->flags |= IO_WQE_FLAG_STALLED;
 
 		spin_unlock_irq(&wqe->lock);
+		if (put_work && wq->put_work)
+			wq->put_work(old_work);
 		if (!work)
 			break;
 next:
@@ -444,6 +449,11 @@ static void io_worker_handle_work(struct io_worker *worker)
 		if (worker->mm)
 			work->flags |= IO_WQ_WORK_HAS_MM;
 
+		if (wq->get_work && !(work->flags & IO_WQ_WORK_INTERNAL)) {
+			put_work = work;
+			wq->get_work(work);
+		}
+
 		old_work = work;
 		work->func(&work);
 
@@ -455,6 +465,12 @@ static void io_worker_handle_work(struct io_worker *worker)
 		}
 		if (work && work != old_work) {
 			spin_unlock_irq(&wqe->lock);
+
+			if (put_work && wq->put_work) {
+				wq->put_work(put_work);
+				put_work = NULL;
+			}
+
 			/* dependent work not hashed */
 			hash = -1U;
 			goto next;
@@ -950,13 +966,15 @@ void io_wq_flush(struct io_wq *wq)
 
 		init_completion(&data.done);
 		INIT_IO_WORK(&data.work, io_wq_flush_func);
+		data.work.flags |= IO_WQ_WORK_INTERNAL;
 		io_wqe_enqueue(wqe, &data.work);
 		wait_for_completion(&data.done);
 	}
 }
 
 struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
-			   struct user_struct *user)
+			   struct user_struct *user, get_work_fn *get_work,
+			   put_work_fn *put_work)
 {
 	int ret = -ENOMEM, i, node;
 	struct io_wq *wq;
@@ -972,6 +990,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	wq->get_work = get_work;
+	wq->put_work = put_work;
+
 	/* caller must already hold a reference to this */
 	wq->user = user;
 
diff --git a/fs/io-wq.h b/fs/io-wq.h
index cc50754d028c..4b29f922f80c 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -10,6 +10,7 @@ enum {
 	IO_WQ_WORK_NEEDS_USER	= 8,
 	IO_WQ_WORK_NEEDS_FILES	= 16,
 	IO_WQ_WORK_UNBOUND	= 32,
+	IO_WQ_WORK_INTERNAL	= 64,
 
 	IO_WQ_HASH_SHIFT	= 24,	/* upper 8 bits are used for hash key */
 };
@@ -34,8 +35,12 @@ struct io_wq_work {
 		(work)->files = NULL;			\
 	} while (0)					\
 
+typedef void (get_work_fn)(struct io_wq_work *);
+typedef void (put_work_fn)(struct io_wq_work *);
+
 struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
-				struct user_struct *user);
+				struct user_struct *user,
+				get_work_fn *get_work, put_work_fn *put_work);
 void io_wq_destroy(struct io_wq *wq);
 
 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 99822bf89924..e1a3b8b667e0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3822,6 +3822,20 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
 	return done ? done : err;
 }
 
+static void io_put_work(struct io_wq_work *work)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+	io_put_req(req);
+}
+
+static void io_get_work(struct io_wq_work *work)
+{
+	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+	refcount_inc(&req->refs);
+}
+
 static int io_sq_offload_start(struct io_ring_ctx *ctx,
 			       struct io_uring_params *p)
 {
@@ -3871,7 +3885,8 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 
 	/* Do QD, or 4 * CPUS, whatever is smallest */
 	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
-	ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, ctx->user);
+	ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, ctx->user,
+					io_get_work, io_put_work);
 	if (IS_ERR(ctx->io_wq)) {
 		ret = PTR_ERR(ctx->io_wq);
 		ctx->io_wq = NULL;

From 36c2f9223e84c1aa84bfba90cb2e74b517c92a54 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 13 Nov 2019 09:43:34 -0700
Subject: [PATCH 58/64] io-wq: ensure we have a stable view of ->cur_work for
 cancellations

worker->cur_work is currently protected by the lock of the wqe that the
worker belongs to. When we send a signal to a worker, we need a stable
view of ->cur_work, so we need to hold that lock. But this doesn't work
so well, since we have the opposite order potentially on queueing work.
If POLL_ADD is used with a signalfd, then io_poll_wake() is called with
the signal lock, and that sometimes needs to insert work items.

Add a specific worker lock that protects the current work item. Then we
can guarantee that the task we're sending a signal is currently
processing the exact work we think it is.

Reported-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 47 ++++++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 26d81540c1fc..4031b75541be 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -49,7 +49,9 @@ struct io_worker {
 	struct task_struct *task;
 	wait_queue_head_t wait;
 	struct io_wqe *wqe;
+
 	struct io_wq_work *cur_work;
+	spinlock_t lock;
 
 	struct rcu_head rcu;
 	struct mm_struct *mm;
@@ -323,7 +325,6 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
 		hlist_nulls_add_head_rcu(&worker->nulls_node,
 						&wqe->busy_list.head);
 	}
-	worker->cur_work = work;
 
 	/*
 	 * If worker is moving from bound to unbound (or vice versa), then
@@ -402,17 +403,6 @@ static void io_worker_handle_work(struct io_worker *worker)
 	do {
 		unsigned hash = -1U;
 
-		/*
-		 * Signals are either sent to cancel specific work, or to just
-		 * cancel all work items. For the former, ->cur_work must
-		 * match. ->cur_work is NULL at this point, since we haven't
-		 * assigned any work, so it's safe to flush signals for that
-		 * case. For the latter case of cancelling all work, the caller
-		 * wil have set IO_WQ_BIT_CANCEL.
-		 */
-		if (signal_pending(current))
-			flush_signals(current);
-
 		/*
 		 * If we got some work, mark us as busy. If we didn't, but
 		 * the list isn't empty, it means we stalled on hashed work.
@@ -432,6 +422,14 @@ static void io_worker_handle_work(struct io_worker *worker)
 		if (!work)
 			break;
 next:
+		/* flush any pending signals before assigning new work */
+		if (signal_pending(current))
+			flush_signals(current);
+
+		spin_lock_irq(&worker->lock);
+		worker->cur_work = work;
+		spin_unlock_irq(&worker->lock);
+
 		if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
 		    current->files != work->files) {
 			task_lock(current);
@@ -457,8 +455,12 @@ static void io_worker_handle_work(struct io_worker *worker)
 		old_work = work;
 		work->func(&work);
 
-		spin_lock_irq(&wqe->lock);
+		spin_lock_irq(&worker->lock);
 		worker->cur_work = NULL;
+		spin_unlock_irq(&worker->lock);
+
+		spin_lock_irq(&wqe->lock);
+
 		if (hash != -1U) {
 			wqe->hash_map &= ~BIT_ULL(hash);
 			wqe->flags &= ~IO_WQE_FLAG_STALLED;
@@ -577,6 +579,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 	worker->nulls_node.pprev = NULL;
 	init_waitqueue_head(&worker->wait);
 	worker->wqe = wqe;
+	spin_lock_init(&worker->lock);
 
 	worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
 				"io_wqe_worker-%d/%d", index, wqe->node);
@@ -783,21 +786,20 @@ struct io_cb_cancel_data {
 static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
 {
 	struct io_cb_cancel_data *data = cancel_data;
-	struct io_wqe *wqe = data->wqe;
 	unsigned long flags;
 	bool ret = false;
 
 	/*
 	 * Hold the lock to avoid ->cur_work going out of scope, caller
-	 * may deference the passed in work.
+	 * may dereference the passed in work.
 	 */
-	spin_lock_irqsave(&wqe->lock, flags);
+	spin_lock_irqsave(&worker->lock, flags);
 	if (worker->cur_work &&
 	    data->cancel(worker->cur_work, data->caller_data)) {
 		send_sig(SIGINT, worker->task, 1);
 		ret = true;
 	}
-	spin_unlock_irqrestore(&wqe->lock, flags);
+	spin_unlock_irqrestore(&worker->lock, flags);
 
 	return ret;
 }
@@ -864,13 +866,20 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 {
 	struct io_wq_work *work = data;
+	unsigned long flags;
+	bool ret = false;
 
+	if (worker->cur_work != work)
+		return false;
+
+	spin_lock_irqsave(&worker->lock, flags);
 	if (worker->cur_work == work) {
 		send_sig(SIGINT, worker->task, 1);
-		return true;
+		ret = true;
 	}
+	spin_unlock_irqrestore(&worker->lock, flags);
 
-	return false;
+	return ret;
 }
 
 static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,

From e61df66c69b11bc050d233dc95714a6339192c28 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 13 Nov 2019 13:54:49 -0700
Subject: [PATCH 59/64] io-wq: ensure free/busy list browsing see all items

We have two lists for workers in io-wq, a busy and a free list. For
certain operations we want to browse all workers, and we currently do
that by browsing the two separate lists. But since these lists are RCU
protected, we can potentially miss workers if they move between the two
lists while we're browsing them.

Add a third list, all_list, that simply holds all workers. A worker is
added to that list when it starts, and removed when it exits. This makes
the worker iteration cleaner, too.

Reported-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 41 +++++++++++------------------------------
 1 file changed, 11 insertions(+), 30 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 4031b75541be..fcb6c74209da 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -46,6 +46,7 @@ struct io_worker {
 	refcount_t ref;
 	unsigned flags;
 	struct hlist_nulls_node nulls_node;
+	struct list_head all_list;
 	struct task_struct *task;
 	wait_queue_head_t wait;
 	struct io_wqe *wqe;
@@ -96,6 +97,7 @@ struct io_wqe {
 
 	struct io_wq_nulls_list free_list;
 	struct io_wq_nulls_list busy_list;
+	struct list_head all_list;
 
 	struct io_wq *wq;
 };
@@ -212,6 +214,7 @@ static void io_worker_exit(struct io_worker *worker)
 
 	spin_lock_irq(&wqe->lock);
 	hlist_nulls_del_rcu(&worker->nulls_node);
+	list_del_rcu(&worker->all_list);
 	if (__io_worker_unuse(wqe, worker)) {
 		__release(&wqe->lock);
 		spin_lock_irq(&wqe->lock);
@@ -590,6 +593,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 
 	spin_lock_irq(&wqe->lock);
 	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list.head);
+	list_add_tail_rcu(&worker->all_list, &wqe->all_list);
 	worker->flags |= IO_WORKER_F_FREE;
 	if (index == IO_WQ_ACCT_BOUND)
 		worker->flags |= IO_WORKER_F_BOUND;
@@ -733,16 +737,13 @@ static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
  * worker that isn't exiting
  */
 static bool io_wq_for_each_worker(struct io_wqe *wqe,
-				  struct io_wq_nulls_list *list,
 				  bool (*func)(struct io_worker *, void *),
 				  void *data)
 {
-	struct hlist_nulls_node *n;
 	struct io_worker *worker;
 	bool ret = false;
 
-restart:
-	hlist_nulls_for_each_entry_rcu(worker, n, &list->head, nulls_node) {
+	list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
 		if (io_worker_get(worker)) {
 			ret = func(worker, data);
 			io_worker_release(worker);
@@ -750,8 +751,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
 				break;
 		}
 	}
-	if (!ret && get_nulls_value(n) != list->nulls)
-		goto restart;
+
 	return ret;
 }
 
@@ -769,10 +769,7 @@ void io_wq_cancel_all(struct io_wq *wq)
 	for (i = 0; i < wq->nr_wqes; i++) {
 		struct io_wqe *wqe = wq->wqes[i];
 
-		io_wq_for_each_worker(wqe, &wqe->busy_list,
-					io_wqe_worker_send_sig, NULL);
-		io_wq_for_each_worker(wqe, &wqe->free_list,
-					io_wqe_worker_send_sig, NULL);
+		io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
 	}
 	rcu_read_unlock();
 }
@@ -834,14 +831,7 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
 	}
 
 	rcu_read_lock();
-	found = io_wq_for_each_worker(wqe, &wqe->free_list, io_work_cancel,
-					&data);
-	if (found)
-		goto done;
-
-	found = io_wq_for_each_worker(wqe, &wqe->busy_list, io_work_cancel,
-					&data);
-done:
+	found = io_wq_for_each_worker(wqe, io_work_cancel, &data);
 	rcu_read_unlock();
 	return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
 }
@@ -919,14 +909,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
 	 * completion will run normally in this case.
 	 */
 	rcu_read_lock();
-	found = io_wq_for_each_worker(wqe, &wqe->free_list, io_wq_worker_cancel,
-					cwork);
-	if (found)
-		goto done;
-
-	found = io_wq_for_each_worker(wqe, &wqe->busy_list, io_wq_worker_cancel,
-					cwork);
-done:
+	found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, cwork);
 	rcu_read_unlock();
 	return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
 }
@@ -1030,6 +1013,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
 		wqe->free_list.nulls = 0;
 		INIT_HLIST_NULLS_HEAD(&wqe->busy_list.head, 1);
 		wqe->busy_list.nulls = 1;
+		INIT_LIST_HEAD(&wqe->all_list);
 
 		i++;
 	}
@@ -1077,10 +1061,7 @@ void io_wq_destroy(struct io_wq *wq)
 
 		if (!wqe)
 			continue;
-		io_wq_for_each_worker(wqe, &wqe->free_list, io_wq_worker_wake,
-						NULL);
-		io_wq_for_each_worker(wqe, &wqe->busy_list, io_wq_worker_wake,
-						NULL);
+		io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL);
 	}
 	rcu_read_unlock();
 

From 2f6d9b9d6357ede64a29437676884ee263039910 Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Wed, 13 Nov 2019 18:06:24 +0800
Subject: [PATCH 60/64] io_uring: clean up io_uring_cancel_files()

We don't use the return value anymore, drop it. Also drop the
unecessary double cancel_req value check.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e1a3b8b667e0..297b9e80dc5c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4303,7 +4303,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 	DEFINE_WAIT(wait);
 
 	while (!list_empty_careful(&ctx->inflight_list)) {
-		enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
 		struct io_kiocb *cancel_req = NULL;
 
 		spin_lock_irq(&ctx->inflight_lock);
@@ -4321,14 +4320,12 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 						TASK_UNINTERRUPTIBLE);
 		spin_unlock_irq(&ctx->inflight_lock);
 
-		if (cancel_req) {
-			ret = io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
-			io_put_req(cancel_req);
-		}
-
 		/* We need to keep going until we don't find a matching req */
 		if (!cancel_req)
 			break;
+
+		io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
+		io_put_req(cancel_req);
 		schedule();
 	}
 	finish_wait(&ctx->inflight_wait, &wait);

From 9d858b21483981db9c0cb4b184d4cdeb4bc525c2 Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Wed, 13 Nov 2019 18:06:25 +0800
Subject: [PATCH 61/64] io_uring: introduce req_need_defer()

Makes the code easier to read.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 297b9e80dc5c..9500780bcaea 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -448,7 +448,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	return NULL;
 }
 
-static inline bool __io_sequence_defer(struct io_kiocb *req)
+static inline bool __req_need_defer(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
@@ -456,12 +456,12 @@ static inline bool __io_sequence_defer(struct io_kiocb *req)
 					+ atomic_read(&ctx->cached_cq_overflow);
 }
 
-static inline bool io_sequence_defer(struct io_kiocb *req)
+static inline bool req_need_defer(struct io_kiocb *req)
 {
-	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
-		return false;
+	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN)
+		return __req_need_defer(req);
 
-	return __io_sequence_defer(req);
+	return false;
 }
 
 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
@@ -469,7 +469,7 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 	struct io_kiocb *req;
 
 	req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
-	if (req && !io_sequence_defer(req)) {
+	if (req && !req_need_defer(req)) {
 		list_del_init(&req->list);
 		return req;
 	}
@@ -482,7 +482,7 @@ static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
 	struct io_kiocb *req;
 
 	req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
-	if (req && !__io_sequence_defer(req)) {
+	if (req && !__req_need_defer(req)) {
 		list_del_init(&req->list);
 		return req;
 	}
@@ -2436,7 +2436,8 @@ static int io_req_defer(struct io_kiocb *req)
 	struct io_uring_sqe *sqe_copy;
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (!io_sequence_defer(req) && list_empty(&ctx->defer_list))
+	/* Still need defer if there is pending req in defer list. */
+	if (!req_need_defer(req) && list_empty(&ctx->defer_list))
 		return 0;
 
 	sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
@@ -2444,7 +2445,7 @@ static int io_req_defer(struct io_kiocb *req)
 		return -EAGAIN;
 
 	spin_lock_irq(&ctx->completion_lock);
-	if (!io_sequence_defer(req) && list_empty(&ctx->defer_list)) {
+	if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
 		spin_unlock_irq(&ctx->completion_lock);
 		kfree(sqe_copy);
 		return 0;

From a320e9fa1e2680116d165b9369dfa41d7cc1e1d1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 14 Nov 2019 00:11:01 +0300
Subject: [PATCH 62/64] io_uring: Fix getting file for non-fd opcodes

For timeout requests and bunch of others io_uring tries to grab a file
with specified fd, which is usually stdin/fd=0.
Update io_op_needs_file()

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9500780bcaea..55f8b1d378df 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2599,6 +2599,10 @@ static bool io_op_needs_file(const struct io_uring_sqe *sqe)
 	switch (op) {
 	case IORING_OP_NOP:
 	case IORING_OP_POLL_REMOVE:
+	case IORING_OP_TIMEOUT:
+	case IORING_OP_TIMEOUT_REMOVE:
+	case IORING_OP_ASYNC_CANCEL:
+	case IORING_OP_LINK_TIMEOUT:
 		return false;
 	default:
 		return true;

From 021d1cdda3875bf35edac9133335f622d7910abc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 14 Nov 2019 08:00:41 -0700
Subject: [PATCH 63/64] io-wq: remove now redundant struct io_wq_nulls_list

Since we don't iterate these lists anymore after commit:

e61df66c69b1 ("io-wq: ensure free/busy list browsing see all items")

we don't need to retain the nulls value we use for them. That means it's
pretty pointless to wrap the hlist_nulls_head in a structure, so get rid
of it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index fcb6c74209da..9174007ce107 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -59,11 +59,6 @@ struct io_worker {
 	struct files_struct *restore_files;
 };
 
-struct io_wq_nulls_list {
-	struct hlist_nulls_head head;
-	unsigned long nulls;
-};
-
 #if BITS_PER_LONG == 64
 #define IO_WQ_HASH_ORDER	6
 #else
@@ -95,8 +90,8 @@ struct io_wqe {
 	int node;
 	struct io_wqe_acct acct[2];
 
-	struct io_wq_nulls_list free_list;
-	struct io_wq_nulls_list busy_list;
+	struct hlist_nulls_head free_list;
+	struct hlist_nulls_head busy_list;
 	struct list_head all_list;
 
 	struct io_wq *wq;
@@ -249,7 +244,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
 	struct hlist_nulls_node *n;
 	struct io_worker *worker;
 
-	n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list.head));
+	n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
 	if (is_a_nulls(n))
 		return false;
 
@@ -325,8 +320,7 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
 	if (worker->flags & IO_WORKER_F_FREE) {
 		worker->flags &= ~IO_WORKER_F_FREE;
 		hlist_nulls_del_init_rcu(&worker->nulls_node);
-		hlist_nulls_add_head_rcu(&worker->nulls_node,
-						&wqe->busy_list.head);
+		hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->busy_list);
 	}
 
 	/*
@@ -365,8 +359,7 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
 	if (!(worker->flags & IO_WORKER_F_FREE)) {
 		worker->flags |= IO_WORKER_F_FREE;
 		hlist_nulls_del_init_rcu(&worker->nulls_node);
-		hlist_nulls_add_head_rcu(&worker->nulls_node,
-						&wqe->free_list.head);
+		hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 	}
 
 	return __io_worker_unuse(wqe, worker);
@@ -592,7 +585,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 	}
 
 	spin_lock_irq(&wqe->lock);
-	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list.head);
+	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 	list_add_tail_rcu(&worker->all_list, &wqe->all_list);
 	worker->flags |= IO_WORKER_F_FREE;
 	if (index == IO_WQ_ACCT_BOUND)
@@ -617,7 +610,7 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
 	if (index == IO_WQ_ACCT_BOUND && !acct->nr_workers)
 		return true;
 	/* if we have available workers or no work, no need */
-	if (!hlist_nulls_empty(&wqe->free_list.head) || !io_wqe_run_queue(wqe))
+	if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
 		return false;
 	return acct->nr_workers < acct->max_workers;
 }
@@ -665,7 +658,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
 		return true;
 
 	rcu_read_lock();
-	free_worker = !hlist_nulls_empty(&wqe->free_list.head);
+	free_worker = !hlist_nulls_empty(&wqe->free_list);
 	rcu_read_unlock();
 	if (free_worker)
 		return true;
@@ -1009,10 +1002,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
 		wqe->wq = wq;
 		spin_lock_init(&wqe->lock);
 		INIT_LIST_HEAD(&wqe->work_list);
-		INIT_HLIST_NULLS_HEAD(&wqe->free_list.head, 0);
-		wqe->free_list.nulls = 0;
-		INIT_HLIST_NULLS_HEAD(&wqe->busy_list.head, 1);
-		wqe->busy_list.nulls = 1;
+		INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
+		INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1);
 		INIT_LIST_HEAD(&wqe->all_list);
 
 		i++;

From eac406c61cd0ec8fe7970ca46ddf23e40a86b579 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 14 Nov 2019 12:09:58 -0700
Subject: [PATCH 64/64] io_uring: make POLL_ADD/POLL_REMOVE scale better

One of the obvious use cases for these commands is networking, where
it's not uncommon to have tons of sockets open and polled for. The
current implementation uses a list for insertion and lookup, which works
fine for file based use cases where the count is usually low, it breaks
down somewhat for higher number of files / sockets. A test case with
30k sockets being polled for and cancelled takes:

real    0m6.968s
user    0m0.002s
sys     0m6.936s

with the patch it takes:

real    0m0.233s
user    0m0.010s
sys     0m0.176s

If you go to 50k sockets, it gets even more abysmal with the current
code:

real    0m40.602s
user    0m0.010s
sys     0m40.555s

with the patch it takes:

real    0m0.398s
user    0m0.000s
sys     0m0.341s

Change is pretty straight forward, just replace the cancel_list with
a red/black tree instead.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 69 ++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 54 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 55f8b1d378df..5ad652fa24b8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -271,7 +271,7 @@ struct io_ring_ctx {
 		 * manipulate the list, hence no extra locking is needed there.
 		 */
 		struct list_head	poll_list;
-		struct list_head	cancel_list;
+		struct rb_root		cancel_tree;
 
 		spinlock_t		inflight_lock;
 		struct list_head	inflight_list;
@@ -323,7 +323,10 @@ struct io_kiocb {
 	struct sqe_submit	submit;
 
 	struct io_ring_ctx	*ctx;
-	struct list_head	list;
+	union {
+		struct list_head	list;
+		struct rb_node		rb_node;
+	};
 	struct list_head	link_list;
 	unsigned int		flags;
 	refcount_t		refs;
@@ -433,7 +436,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
 	INIT_LIST_HEAD(&ctx->poll_list);
-	INIT_LIST_HEAD(&ctx->cancel_list);
+	ctx->cancel_tree = RB_ROOT;
 	INIT_LIST_HEAD(&ctx->defer_list);
 	INIT_LIST_HEAD(&ctx->timeout_list);
 	init_waitqueue_head(&ctx->inflight_wait);
@@ -1934,6 +1937,14 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 #endif
 }
 
+static inline void io_poll_remove_req(struct io_kiocb *req)
+{
+	if (!RB_EMPTY_NODE(&req->rb_node)) {
+		rb_erase(&req->rb_node, &req->ctx->cancel_tree);
+		RB_CLEAR_NODE(&req->rb_node);
+	}
+}
+
 static void io_poll_remove_one(struct io_kiocb *req)
 {
 	struct io_poll_iocb *poll = &req->poll;
@@ -1945,17 +1956,17 @@ static void io_poll_remove_one(struct io_kiocb *req)
 		io_queue_async_work(req);
 	}
 	spin_unlock(&poll->head->lock);
-
-	list_del_init(&req->list);
+	io_poll_remove_req(req);
 }
 
 static void io_poll_remove_all(struct io_ring_ctx *ctx)
 {
+	struct rb_node *node;
 	struct io_kiocb *req;
 
 	spin_lock_irq(&ctx->completion_lock);
-	while (!list_empty(&ctx->cancel_list)) {
-		req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
+	while ((node = rb_first(&ctx->cancel_tree)) != NULL) {
+		req = rb_entry(node, struct io_kiocb, rb_node);
 		io_poll_remove_one(req);
 	}
 	spin_unlock_irq(&ctx->completion_lock);
@@ -1963,13 +1974,21 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
 
 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
 {
+	struct rb_node *p, *parent = NULL;
 	struct io_kiocb *req;
 
-	list_for_each_entry(req, &ctx->cancel_list, list) {
-		if (req->user_data != sqe_addr)
-			continue;
-		io_poll_remove_one(req);
-		return 0;
+	p = ctx->cancel_tree.rb_node;
+	while (p) {
+		parent = p;
+		req = rb_entry(parent, struct io_kiocb, rb_node);
+		if (sqe_addr < req->user_data) {
+			p = p->rb_left;
+		} else if (sqe_addr > req->user_data) {
+			p = p->rb_right;
+		} else {
+			io_poll_remove_one(req);
+			return 0;
+		}
 	}
 
 	return -ENOENT;
@@ -2039,7 +2058,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 		spin_unlock_irq(&ctx->completion_lock);
 		return;
 	}
-	list_del_init(&req->list);
+	io_poll_remove_req(req);
 	io_poll_complete(req, mask);
 	spin_unlock_irq(&ctx->completion_lock);
 
@@ -2073,7 +2092,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	 * for finalizing the request, mark us as having grabbed that already.
 	 */
 	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
-		list_del(&req->list);
+		io_poll_remove_req(req);
 		io_poll_complete(req, mask);
 		req->flags |= REQ_F_COMP_LOCKED;
 		io_put_req(req);
@@ -2108,6 +2127,25 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 	add_wait_queue(head, &pt->req->poll.wait);
 }
 
+static void io_poll_req_insert(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct rb_node **p = &ctx->cancel_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct io_kiocb *tmp;
+
+	while (*p) {
+		parent = *p;
+		tmp = rb_entry(parent, struct io_kiocb, rb_node);
+		if (req->user_data < tmp->user_data)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&req->rb_node, parent, p);
+	rb_insert_color(&req->rb_node, &ctx->cancel_tree);
+}
+
 static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		       struct io_kiocb **nxt)
 {
@@ -2129,6 +2167,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	INIT_IO_WORK(&req->work, io_poll_complete_work);
 	events = READ_ONCE(sqe->poll_events);
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+	RB_CLEAR_NODE(&req->rb_node);
 
 	poll->head = NULL;
 	poll->done = false;
@@ -2161,7 +2200,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		else if (cancel)
 			WRITE_ONCE(poll->canceled, true);
 		else if (!poll->done) /* actually waiting for an event */
-			list_add_tail(&req->list, &ctx->cancel_list);
+			io_poll_req_insert(req);
 		spin_unlock(&poll->head->lock);
 	}
 	if (mask) { /* no async, we'd stolen it */