From 3671163beb633fbe3297b8e30369b640ce4bd690 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Nov 2022 10:59:40 +0000 Subject: io_uring: move kbuf put out of generic tw complete There are multiple users of io_req_task_complete() including zc notifications, but only read requests use selected buffers. As we already have an rw specific tw function, move io_put_kbuf() in there. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/94374c7649aaefc3a17808dc4701f25ccd457e25.1667557923.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8840cf3e20f2..bdb7e15f1c48 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1476,12 +1476,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) void io_req_task_complete(struct io_kiocb *req, bool *locked) { - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; - - req->cqe.flags |= io_put_kbuf(req, issue_flags); - } - if (*locked) io_req_complete_defer(req); else -- cgit v1.2.3 From ef67fcb41de6d3d5bbb16aaa66d4c706c4cacf54 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Mon, 7 Nov 2022 04:33:49 -0800 Subject: io_uring: do not always force run task_work in io_uring_register Running task work when not needed can unnecessarily delay operations. Specifically IORING_SETUP_DEFER_TASKRUN tries to avoid running task work until the user requests it. Therefore do not run it in io_uring_register any more. The one catch is that io_rsrc_ref_quiesce expects it to have run in order to process all outstanding references, and so reorder it's loop to do this. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221107123349.4106213-1-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 -- io_uring/rsrc.c | 7 ++++++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bdb7e15f1c48..cf68d16255a0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -4056,8 +4056,6 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, ctx = f.file->private_data; - io_run_task_work_ctx(ctx); - mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 55d4ab96fb92..187f1c83e779 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -321,6 +321,11 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, if (atomic_dec_and_test(&data->refs)) break; mutex_unlock(&ctx->uring_lock); + + ret = io_run_task_work_sig(ctx); + if (ret < 0) + goto reinit; + flush_delayed_work(&ctx->rsrc_put_work); ret = wait_for_completion_interruptible(&data->done); if (!ret) { @@ -336,12 +341,12 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, } } +reinit: atomic_inc(&data->refs); /* wait for all works potentially completing data->done */ flush_delayed_work(&ctx->rsrc_put_work); reinit_completion(&data->done); - ret = io_run_task_work_sig(ctx); mutex_lock(&ctx->uring_lock); } while (ret >= 0); data->quiesce = false; -- cgit v1.2.3 From 23a6c9ac4dbd7cccf5b909e78aa84192b65f2833 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Thu, 10 Nov 2022 20:21:03 +0800 Subject: io_uring: update outdated comment of callbacks Previous commit ebc11b6c6b87 ("io_uring: clean io-wq 
callbacks") rename io_free_work() into io_wq_free_work() for consistency. This patch also updates relevant comment to avoid misunderstanding. Fixes: ebc11b6c6b87 ("io_uring: clean io-wq callbacks") Signed-off-by: Lin Ma Link: https://lore.kernel.org/r/20221110122103.20120-1-linma@zju.edu.cn Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cf68d16255a0..c770eed4d717 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1781,7 +1781,7 @@ void io_wq_submit_work(struct io_wq_work *work) bool needs_poll = false; int ret = 0, err = -ECANCELED; - /* one will be dropped by ->io_free_work() after returning to io-wq */ + /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else -- cgit v1.2.3 From e52d2e583e4ad1d5d0b804d79c2b8752eb0e5ceb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Nov 2022 16:54:08 +0000 Subject: io_uring: inline io_req_task_work_add() __io_req_task_work_add() is huge but marked inline, that makes compilers to generate lots of garbage. Inline the wrapper caller io_req_task_work_add() instead. before and after: text data bss dec hex filename 47347 16248 8 63603 f873 io_uring/io_uring.o text data bss dec hex filename 45303 16248 8 61559 f077 io_uring/io_uring.o Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/26dc8c28ca0160e3269ef3e55c5a8b917c4d4450.1668162751.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 7 +------ io_uring/io_uring.h | 7 ++++++- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c770eed4d717..8f452dfb4f1c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1127,7 +1127,7 @@ static void io_req_local_work_add(struct io_kiocb *req) __io_cqring_wake(ctx); } -static inline void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) +void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; @@ -1159,11 +1159,6 @@ static inline void __io_req_task_work_add(struct io_kiocb *req, bool allow_local } } -void io_req_task_work_add(struct io_kiocb *req) -{ - __io_req_task_work_add(req, true); -} - static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) { struct llist_node *node; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index cef5ff924e63..38d9e149d2db 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -50,9 +50,9 @@ static inline bool io_req_ffs_set(struct io_kiocb *req) return req->flags & REQ_F_FIXED_FILE; } +void __io_req_task_work_add(struct io_kiocb *req, bool allow_local); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); -void io_req_task_work_add(struct io_kiocb *req); void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_queue(struct io_kiocb *req); void io_queue_iowq(struct io_kiocb *req, bool *dont_use); @@ -82,6 +82,11 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); +static inline void io_req_task_work_add(struct io_kiocb *req) +{ + __io_req_task_work_add(req, true); +} + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) 
-- cgit v1.2.3 From d75936062049522172a107c994242b76c89777f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 11 Nov 2022 16:54:09 +0000 Subject: io_uring: split tw fallback into a function When the target process is dying and so task_work_add() is not allowed we push all task_work item to the fallback workqueue. Move the part responsible for moving tw items out of __io_req_task_work_add() into a separate function. Makes it a bit cleaner and gives the compiler a bit of extra info. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e503dab9d7af95470ca6b214c6de17715ae4e748.1668162751.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8f452dfb4f1c..9925ac08c398 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1105,6 +1105,20 @@ void tctx_task_work(struct callback_head *cb) trace_io_uring_task_work_run(tctx, count, loops); } +static __cold void io_fallback_tw(struct io_uring_task *tctx) +{ + struct llist_node *node = llist_del_all(&tctx->task_list); + struct io_kiocb *req; + + while (node) { + req = container_of(node, struct io_kiocb, io_task_work.node); + node = node->next; + if (llist_add(&req->io_task_work.node, + &req->ctx->fallback_llist)) + schedule_delayed_work(&req->ctx->fallback_work, 1); + } +} + static void io_req_local_work_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1131,7 +1145,6 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - struct llist_node *node; if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { io_req_local_work_add(req); @@ -1148,15 +1161,7 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; - node = llist_del_all(&tctx->task_list); - - while (node) { - req = container_of(node, struct io_kiocb, io_task_work.node); - node = node->next; - if (llist_add(&req->io_task_work.node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); - } + io_fallback_tw(tctx); } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) -- cgit v1.2.3 From f9d567c75ec216447f36da6e855500023504fa04 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Nov 2022 18:41:06 +0000 Subject: io_uring: inline __io_req_complete_post() There is only one user of __io_req_complete_post(), inline it. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ef4c9059950a3da5cf68df00f977f1fd13bd9306.1668597569.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 +++-------- io_uring/io_uring.h | 1 - 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9925ac08c398..1299f9c8567a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -854,19 +854,14 @@ static void __io_req_complete_put(struct io_kiocb *req) } } -void __io_req_complete_post(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req->ctx, req); - __io_req_complete_put(req); -} - void io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; io_cq_lock(ctx); - __io_req_complete_post(req); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe_req(ctx, req); + __io_req_complete_put(req); io_cq_unlock_post(ctx); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 38d9e149d2db..69fbd27c7577 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -32,7 +32,6 @@ int io_run_local_work(struct io_ring_ctx *ctx); void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); -void __io_req_complete_post(struct io_kiocb *req); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, -- cgit v1.2.3 From 4464853277d0ccdb9914608dd1332f0fa2f9846f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Nov 2022 10:18:45 -0700 Subject: io_uring: pass in EPOLL_URING_WAKE for eventfd signaling and wakeups Pass in EPOLL_URING_WAKE when signaling eventfd or doing poll related wakups, so that we can check for a circular event dependency between eventfd and epoll. If this flag is set when our wakeup handlers are called, then we know we have a dependency that needs to terminate multishot requests. eventfd and epoll are the only such possible dependencies. 
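As a rough userspace illustration of the dependency this guards against (not part of the patch; it assumes liburing's io_uring_register_eventfd() and io_uring_prep_poll_multishot() helpers, liburing 2.2+): if the ring's registered eventfd is itself watched by a multishot poll on the same ring, every posted CQE signals the eventfd, which would wake the poll and post yet another CQE. With this change the wakeup arrives with EPOLL_URING_WAKE in the key, so the poll is downgraded to oneshot instead of looping.

/* Hypothetical sketch, not part of this patch: a ring whose registered
 * eventfd is also polled (multishot) through the very same ring. */
#include <liburing.h>
#include <sys/eventfd.h>
#include <poll.h>

int setup_self_wake_loop(struct io_uring *ring)
{
	struct io_uring_sqe *sqe;
	int efd = eventfd(0, 0);

	if (efd < 0)
		return -1;
	/* CQE completions on this ring now signal efd... */
	if (io_uring_register_eventfd(ring, efd))
		return -1;
	/* ...and efd readability is watched by the same ring */
	sqe = io_uring_get_sqe(ring);
	io_uring_prep_poll_multishot(sqe, efd, POLLIN);
	return io_uring_submit(ring) == 1 ? 0 : -1;
}
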
Cc: stable@vger.kernel.org # 6.0 Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/io_uring.h | 15 +++++++++++---- io_uring/poll.c | 8 ++++++++ 3 files changed, 21 insertions(+), 6 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1299f9c8567a..762ecab801f2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -495,7 +495,7 @@ static void io_eventfd_ops(struct rcu_head *rcu) int ops = atomic_xchg(&ev_fd->ops, 0); if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) - eventfd_signal(ev_fd->cq_ev_fd, 1); + eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback * ordering in a race but if references are 0 we know we have to free @@ -531,7 +531,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) goto out; if (likely(eventfd_signal_allowed())) { - eventfd_signal(ev_fd->cq_ev_fd, 1); + eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); } else { atomic_inc(&ev_fd->refs); if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 69fbd27c7577..83013ee584d6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "io-wq.h" #include "slist.h" #include "filetable.h" @@ -211,12 +212,18 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) static inline void __io_cqring_wake(struct io_ring_ctx *ctx) { /* - * wake_up_all() may seem excessive, but io_wake_function() and - * io_should_wake() handle the termination of the loop and only - * wake as many waiters as we need to. + * Trigger waitqueue handler on all waiters on our waitqueue. This + * won't necessarily wake up all the tasks, io_should_wake() will make + * that decision. + * + * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter + * set in the mask so that if we recurse back into our own poll + * waitqueue handlers, we know we have a dependency between eventfd or + * epoll and should terminate multishot poll at that point. */ if (waitqueue_active(&ctx->cq_wait)) - wake_up_all(&ctx->cq_wait); + __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, + poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } static inline void io_cqring_wake(struct io_ring_ctx *ctx) diff --git a/io_uring/poll.c b/io_uring/poll.c index 8fb8e781c02d..22c9b2e0944a 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -389,6 +389,14 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, return 0; if (io_poll_get_ownership(req)) { + /* + * If we trigger a multishot poll off our own wakeup path, + * disable multishot as there is a circular dependency between + * CQ posting and triggering the event. + */ + if (mask & EPOLL_URING_WAKE) + poll->events |= EPOLLONESHOT; + /* optional, saves extra locking for removal in tw handler */ if (mask && poll->events & EPOLLONESHOT) { list_del_init(&poll->wait.entry); -- cgit v1.2.3 From 6c16fe3c16bdc420719768f7ea97b82bd6303eec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Nov 2022 07:51:15 -0700 Subject: io_uring: kill io_cqring_ev_posted() and __io_cq_unlock_post() __io_cq_unlock_post() is identical to io_cq_unlock_post(), and io_cqring_ev_posted() has a single caller so migth as well just inline it there. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 762ecab801f2..2260fb7aa7f2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -581,23 +581,14 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_flush_signal(ctx); } -static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) -{ - io_commit_cqring_flush(ctx); - io_cqring_wake(ctx); -} - -static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) +void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} -void io_cq_unlock_post(struct io_ring_ctx *ctx) -{ - __io_cq_unlock_post(ctx); + io_commit_cqring_flush(ctx); + io_cqring_wake(ctx); } /* Returns true if there are no backlogged entries after the flush */ @@ -1346,7 +1337,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(ctx, req); } - __io_cq_unlock_post(ctx); + io_cq_unlock_post(ctx); io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); -- cgit v1.2.3 From e276ae344a770f91912a81c6a338d92efd319be2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:37 +0000 Subject: io_uring: hold locks for io_req_complete_failed A preparation patch, make sure we always hold uring_lock around io_req_complete_failed(). The only place deviating from the rule is io_cancel_defer_files(), queue a tw instead. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/70760344eadaecf2939287084b9d4ba5c05a6984.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2260fb7aa7f2..4d16f3b1ee11 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -862,9 +862,12 @@ inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) } void io_req_complete_failed(struct io_kiocb *req, s32 res) + __must_hold(&ctx->uring_lock) { const struct io_op_def *def = &io_op_defs[req->opcode]; + lockdep_assert_held(&req->ctx->uring_lock); + req_set_fail(req); io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); if (def->fail) @@ -1615,6 +1618,7 @@ static u32 io_get_sequence(struct io_kiocb *req) } static __cold void io_drain_req(struct io_kiocb *req) + __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; @@ -2849,7 +2853,7 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); list_del_init(&de->list); - io_req_complete_failed(de->req, -ECANCELED); + io_req_task_queue_fail(de->req, -ECANCELED); kfree(de); } return true; -- cgit v1.2.3 From 833b5dfffc26c81835ce38e2a5df9ac5fa142735 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:39 +0000 Subject: io_uring: remove io_req_tw_post_queue Remove io_req_tw_post() and io_req_tw_post_queue(), we can use io_req_task_complete() instead. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b9b73c08022c7f1457023ac841f35c0100e70345.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 12 ------------ io_uring/io_uring.h | 8 +++++++- io_uring/timeout.c | 6 +++--- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4d16f3b1ee11..e445344f6f07 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1226,18 +1226,6 @@ int io_run_local_work(struct io_ring_ctx *ctx) return ret; } -static void io_req_tw_post(struct io_kiocb *req, bool *locked) -{ - io_req_complete_post(req); -} - -void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) -{ - io_req_set_res(req, res, cflags); - req->io_task_work.func = io_req_tw_post; - io_req_task_work_add(req); -} - static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { /* not needed for normal modes, but SQPOLL depends on it */ diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 83013ee584d6..222af88df10f 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -53,7 +53,6 @@ static inline bool io_req_ffs_set(struct io_kiocb *req) void __io_req_task_work_add(struct io_kiocb *req, bool allow_local); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); -void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_queue(struct io_kiocb *req); void io_queue_iowq(struct io_kiocb *req, bool *dont_use); void io_req_task_complete(struct io_kiocb *req, bool *locked); @@ -375,4 +374,11 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) ctx->submitter_task == current); } +static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) +{ + io_req_set_res(req, res, 0); + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); +} + #endif diff --git a/io_uring/timeout.c b/io_uring/timeout.c index a819818df7b3..5b4bc93fd6e0 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -63,7 +63,7 @@ static bool io_kill_timeout(struct io_kiocb *req, int status) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&timeout->list); - io_req_tw_post_queue(req, status, 0); + io_req_queue_tw_complete(req, status); return true; } return false; @@ -159,7 +159,7 @@ void io_disarm_next(struct io_kiocb *req) req->flags &= ~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); - io_req_tw_post_queue(link, -ECANCELED, 0); + io_req_queue_tw_complete(link, -ECANCELED); } } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; @@ -168,7 +168,7 @@ void io_disarm_next(struct io_kiocb *req) link = io_disarm_linked_timeout(req); spin_unlock_irq(&ctx->timeout_lock); if (link) - io_req_tw_post_queue(link, -ECANCELED, 0); + io_req_queue_tw_complete(link, -ECANCELED); } if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) -- cgit v1.2.3 From fa18fa2272c7469e470dcb7bf838ea50a25494ca Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:40 +0000 Subject: io_uring: inline __io_req_complete_put() Inline __io_req_complete_put() into io_req_complete_post(), there are no other users. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1923a4dfe80fa877f859a22ed3df2d5fc8ecf02b.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e445344f6f07..a0c71a2dce19 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -814,15 +814,19 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, return filled; } -static void __io_req_complete_put(struct io_kiocb *req) +void io_req_complete_post(struct io_kiocb *req) { + struct io_ring_ctx *ctx = req->ctx; + + io_cq_lock(ctx); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe_req(ctx, req); + /* * If we're the last reference to this request, add to our locked * free_list cache. */ if (req_ref_put_and_test(req)) { - struct io_ring_ctx *ctx = req->ctx; - if (req->flags & IO_REQ_LINK_FLAGS) { if (req->flags & IO_DISARM_MASK) io_disarm_next(req); @@ -843,16 +847,6 @@ static void __io_req_complete_put(struct io_kiocb *req) wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } -} - -void io_req_complete_post(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - io_cq_lock(ctx); - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(ctx, req); - __io_req_complete_put(req); io_cq_unlock_post(ctx); } -- cgit v1.2.3 From 1bec951c3809051f64a6957fe86d1b4786cc0313 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 23 Nov 2022 11:33:41 +0000 Subject: io_uring: iopoll protect complete_post io_req_complete_post() may be used by iopoll enabled rings, grab locks in this case. That requires to pass issue_flags to propagate the locking state. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cc6d854065c57c838ca8e8806f707a226b70fd2d.1669203009.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 +++++++++++++++------ io_uring/io_uring.h | 10 ++++++++-- io_uring/kbuf.c | 4 ++-- io_uring/poll.c | 2 +- io_uring/uring_cmd.c | 2 +- 5 files changed, 27 insertions(+), 12 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a0c71a2dce19..cc27413129fc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -814,7 +814,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, return filled; } -void io_req_complete_post(struct io_kiocb *req) +static void __io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -850,9 +850,18 @@ void io_req_complete_post(struct io_kiocb *req) io_cq_unlock_post(ctx); } -inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) +void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { - io_req_complete_post(req); + if (!(issue_flags & IO_URING_F_UNLOCKED) || + !(req->ctx->flags & IORING_SETUP_IOPOLL)) { + __io_req_complete_post(req); + } else { + struct io_ring_ctx *ctx = req->ctx; + + mutex_lock(&ctx->uring_lock); + __io_req_complete_post(req); + mutex_unlock(&ctx->uring_lock); + } } void io_req_complete_failed(struct io_kiocb *req, s32 res) @@ -866,7 +875,7 @@ void io_req_complete_failed(struct io_kiocb *req, s32 res) io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); if (def->fail) def->fail(req); - io_req_complete_post(req); + io_req_complete_post(req, 0); } /* @@ -1450,7 +1459,7 @@ void io_req_task_complete(struct io_kiocb *req, bool *locked) if (*locked) io_req_complete_defer(req); else - 
io_req_complete_post(req); + io_req_complete_post_tw(req, locked); } /* @@ -1718,7 +1727,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); else - io_req_complete_post(req); + io_req_complete_post(req, issue_flags); } else if (ret != IOU_ISSUE_SKIP_COMPLETE) return ret; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 222af88df10f..b5b80bf03385 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -31,14 +31,20 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx); int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); int io_run_local_work(struct io_ring_ctx *ctx); void io_req_complete_failed(struct io_kiocb *req, s32 res); -void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); -void io_req_complete_post(struct io_kiocb *req); +void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); +static inline void io_req_complete_post_tw(struct io_kiocb *req, bool *locked) +{ + unsigned flags = *locked ? 0 : IO_URING_F_UNLOCKED; + + io_req_complete_post(req, flags); +} + struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index e2c46889d5fa..e8150ed637d8 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -311,7 +311,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) /* complete before unlock, IOPOLL may need the lock */ io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); + io_req_complete_post(req, 0); io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } @@ -462,7 +462,7 @@ err: req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); + io_req_complete_post(req, 0); io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } diff --git a/io_uring/poll.c b/io_uring/poll.c index cd4d98d622d2..4624e5eba63e 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -312,7 +312,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_poll_tw_hash_eject(req, locked); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_complete_post(req); + io_req_complete_post_tw(req, locked); else if (ret == IOU_POLL_DONE) io_req_task_submit(req, locked); else diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index e50de0b6b9f8..446a189b78b0 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -56,7 +56,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); else - __io_req_complete(req, 0); + io_req_complete_post(req, 0); } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -- cgit v1.2.3 From 973fc83f3a94bdffcacf482641db38f57c7c8609 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:53 -0800 Subject: io_uring: defer all io_req_complete_failed All failures happen under lock now, and can be deferred. To be consistent when the failure has happened after some multishot cqe has been deferred (and keep ordering), always defer failures. 
To make this obvious at the caller (and to help prevent a future bug) rename io_req_complete_failed to io_req_defer_failed. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-4-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 ++++++++--------- io_uring/io_uring.h | 2 +- io_uring/poll.c | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cc27413129fc..4888fe834920 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -864,7 +864,7 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) } } -void io_req_complete_failed(struct io_kiocb *req, s32 res) +void io_req_defer_failed(struct io_kiocb *req, s32 res) __must_hold(&ctx->uring_lock) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -875,7 +875,7 @@ void io_req_complete_failed(struct io_kiocb *req, s32 res) io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); if (def->fail) def->fail(req); - io_req_complete_post(req, 0); + io_req_complete_defer(req); } /* @@ -1231,9 +1231,8 @@ int io_run_local_work(struct io_ring_ctx *ctx) static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { - /* not needed for normal modes, but SQPOLL depends on it */ io_tw_lock(req->ctx, locked); - io_req_complete_failed(req, req->cqe.res); + io_req_defer_failed(req, req->cqe.res); } void io_req_task_submit(struct io_kiocb *req, bool *locked) @@ -1243,7 +1242,7 @@ void io_req_task_submit(struct io_kiocb *req, bool *locked) if (likely(!(req->task->flags & PF_EXITING))) io_queue_sqe(req); else - io_req_complete_failed(req, -EFAULT); + io_req_defer_failed(req, -EFAULT); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -1630,7 +1629,7 @@ queue: ret = io_req_prep_async(req); if (ret) { fail: - io_req_complete_failed(req, ret); + io_req_defer_failed(req, ret); return; } io_prep_async_link(req); @@ -1860,7 +1859,7 @@ static void io_queue_async(struct io_kiocb *req, int ret) struct io_kiocb *linked_timeout; if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { - io_req_complete_failed(req, ret); + io_req_defer_failed(req, ret); return; } @@ -1910,14 +1909,14 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) */ req->flags &= ~REQ_F_HARDLINK; req->flags |= REQ_F_LINK; - io_req_complete_failed(req, req->cqe.res); + io_req_defer_failed(req, req->cqe.res); } else if (unlikely(req->ctx->drain_active)) { io_drain_req(req); } else { int ret = io_req_prep_async(req); if (unlikely(ret)) - io_req_complete_failed(req, ret); + io_req_defer_failed(req, ret); else io_queue_iowq(req, NULL); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index b5b80bf03385..a26d5aa7f3f3 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -30,7 +30,7 @@ bool io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); int io_run_local_work(struct io_ring_ctx *ctx); -void io_req_complete_failed(struct io_kiocb *req, s32 res); +void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); diff --git a/io_uring/poll.c b/io_uring/poll.c index 42aa10b50f6c..4bd43e6f5b72 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -317,7 +317,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool 
*locked) else if (ret == IOU_POLL_DONE) io_req_task_submit(req, locked); else - io_req_complete_failed(req, ret); + io_req_defer_failed(req, ret); } static void __io_poll_execute(struct io_kiocb *req, int mask) -- cgit v1.2.3 From 931147ddfa6e9ffd814272f1c0370c4740acbe17 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:54 -0800 Subject: io_uring: allow defer completion for aux posted cqes Multishot ops cannot use the compl_reqs list as the request must stay in the poll list, but that means they need to run each completion without benefiting from batching. Here introduce batching infrastructure for only small (ie 16 byte) CQEs. This restriction is ok because there are no use cases posting 32 byte CQEs. In the ring keep a batch of up to 16 posted results, and flush in the same way as compl_reqs. 16 was chosen through experimentation on a microbenchmark ([1]), as well as trying not to increase the size of the ring too much. This increases the size to 1472 bytes from 1216. [1]: https://github.com/DylanZA/liburing/commit/9ac66b36bcf4477bfafeff1c5f107896b7ae31cf Run with $ make -j && ./benchmark/reg.b -s 1 -t 2000 -r 10 Gives results: baseline 8309 k/s 8 18807 k/s 16 19338 k/s 32 20134 k/s Suggested-by: Pavel Begunkov Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-5-dylany@meta.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ io_uring/io_uring.c | 27 ++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index f5b687a787a3..accdfecee953 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -174,7 +174,9 @@ struct io_submit_state { bool plug_started; bool need_plug; unsigned short submit_nr; + unsigned int cqes_count; struct blk_plug plug; + struct io_uring_cqe cqes[16]; }; struct io_ev_fd { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4888fe834920..28635e3e578a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -167,7 +167,8 @@ EXPORT_SYMBOL(io_uring_get_socket); static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { - if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + if (!wq_list_empty(&ctx->submit_state.compl_reqs) || + ctx->submit_state.cqes_count) __io_submit_flush_completions(ctx); } @@ -802,6 +803,21 @@ bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags return false; } +static void __io_flush_post_cqes(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + struct io_submit_state *state = &ctx->submit_state; + unsigned int i; + + lockdep_assert_held(&ctx->uring_lock); + for (i = 0; i < state->cqes_count; i++) { + struct io_uring_cqe *cqe = &state->cqes[i]; + + io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags, true); + } + state->cqes_count = 0; +} + bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow) @@ -1323,6 +1339,9 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_submit_state *state = &ctx->submit_state; io_cq_lock(ctx); + /* must come first to preserve CQE ordering in failure cases */ + if (state->cqes_count) + __io_flush_post_cqes(ctx); wq_list_for_each(node, prev, &state->compl_reqs) { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); @@ -1332,8 +1351,10 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) } 
io_cq_unlock_post(ctx); - io_free_batch_list(ctx, state->compl_reqs.first); - INIT_WQ_LIST(&state->compl_reqs); + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { + io_free_batch_list(ctx, state->compl_reqs.first); + INIT_WQ_LIST(&state->compl_reqs); + } } /* -- cgit v1.2.3 From 9b8c54755a2b16d4f23c0ea184b75e2edf77d906 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:55 -0800 Subject: io_uring: add io_aux_cqe which allows deferred completion Use the just introduced deferred post cqe completion state when possible in io_aux_cqe. If not possible fallback to io_post_aux_cqe. This introduces a complication because of allow_overflow. For deferred completions we cannot know without locking the completion_lock if it will overflow (and even if we locked it, another post could sneak in and cause this cqe to be in overflow). However since overflow protection is mostly a best effort defence in depth to prevent infinite loops of CQEs for poll, just checking the overflow bit is going to be good enough and will result in at most 16 (array size of deferred cqes) overflows. Suggested-by: Pavel Begunkov Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-6-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 34 ++++++++++++++++++++++++++++++++++ io_uring/io_uring.h | 2 ++ io_uring/net.c | 7 ++++--- io_uring/poll.c | 4 ++-- 4 files changed, 42 insertions(+), 5 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 28635e3e578a..056aea917cd6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -830,6 +830,40 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, return filled; } +bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, + bool allow_overflow) +{ + struct io_uring_cqe *cqe; + unsigned int length; + + if (!defer) + return io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); + + length = ARRAY_SIZE(ctx->submit_state.cqes); + + lockdep_assert_held(&ctx->uring_lock); + + if (ctx->submit_state.cqes_count == length) { + io_cq_lock(ctx); + __io_flush_post_cqes(ctx); + /* no need to flush - flush is deferred */ + spin_unlock(&ctx->completion_lock); + } + + /* For defered completions this is not as strict as it is otherwise, + * however it's main job is to prevent unbounded posted completions, + * and in that it works just as well. 
+ */ + if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) + return false; + + cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++]; + cqe->user_data = user_data; + cqe->res = res; + cqe->flags = cflags; + return true; +} + static void __io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index a26d5aa7f3f3..dd02adf3d0df 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -36,6 +36,8 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags bool allow_overflow); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); +bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, + bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); static inline void io_req_complete_post_tw(struct io_kiocb *req, bool *locked) diff --git a/io_uring/net.c b/io_uring/net.c index 0de6f78ad978..90342dcb6b1d 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -601,8 +601,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, } if (!mshot_finished) { - if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret, - cflags | IORING_CQE_F_MORE, true)) { + if (io_aux_cqe(req->ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, + req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, true)) { io_recv_prep_retry(req); return false; } @@ -1320,7 +1320,8 @@ retry: if (ret < 0) return ret; - if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) + if (io_aux_cqe(ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, + req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) goto retry; return -ECANCELED; diff --git a/io_uring/poll.c b/io_uring/poll.c index 4bd43e6f5b72..922c1a366c41 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -252,8 +252,8 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_post_aux_cqe(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE, false)) { + if (!io_aux_cqe(ctx, *locked, req->cqe.user_data, + mask, IORING_CQE_F_MORE, false)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } -- cgit v1.2.3 From a77ab745f28d5ab2ce51d0e44e85af942bb77d47 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:56 -0800 Subject: io_uring: make io_fill_cqe_aux static This is only used in io_uring.c Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-7-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/io_uring.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 056aea917cd6..fea84e51e56f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -770,8 +770,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) return &rings->cqes[off]; } -bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { struct io_uring_cqe *cqe; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index dd02adf3d0df..46694f40bf72 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -34,8 +34,6 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct 
io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); -bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow); bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); -- cgit v1.2.3 From 2e2ef4a1dab980d88a1ab45bf0e28c8851999e33 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:57 -0800 Subject: io_uring: add lockdep assertion in io_fill_cqe_aux Add an assertion for the completion lock to io_fill_cqe_aux Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-8-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fea84e51e56f..03051e1fa02e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -775,6 +775,8 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 { struct io_uring_cqe *cqe; + lockdep_assert_held(&ctx->completion_lock); + ctx->cq_extra++; /* -- cgit v1.2.3 From b529c96a896b7bea8464a58d350836cc106d70bd Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:58 -0800 Subject: io_uring: remove overflow param from io_post_aux_cqe The only call sites which would not allow overflow are also call sites which would use the io_aux_cqe as they care about ordering. So remove this parameter from io_post_aux_cqe. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-9-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 12 ++++++++---- io_uring/io_uring.h | 3 +-- io_uring/msg_ring.c | 4 ++-- io_uring/rsrc.c | 4 ++-- 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 03051e1fa02e..7ed9cbeb573f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -820,9 +820,8 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx) state->cqes_count = 0; } -bool io_post_aux_cqe(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { bool filled; @@ -832,6 +831,11 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, return filled; } +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) +{ + return __io_post_aux_cqe(ctx, user_data, res, cflags, true); +} + bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, bool allow_overflow) { @@ -839,7 +843,7 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 unsigned int length; if (!defer) - return io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); + return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); length = ARRAY_SIZE(ctx->submit_state.cqes); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 46694f40bf72..dcb8e3468f1d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -32,8 +32,7 @@ int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); int io_run_local_work(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); -bool 
io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow); +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 90d2fc6fd80e..afb543aab9f6 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -31,7 +31,7 @@ static int io_msg_ring_data(struct io_kiocb *req) if (msg->src_fd || msg->dst_fd || msg->flags) return -EINVAL; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) return 0; return -EOVERFLOW; @@ -116,7 +116,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) * completes with -EOVERFLOW, then the sender must ensure that a * later IORING_OP_MSG_RING delivers the message. */ - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) ret = -EOVERFLOW; out_unlock: io_double_unlock_ctx(ctx, target_ctx, issue_flags); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 187f1c83e779..133608200769 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -170,10 +170,10 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) if (prsrc->tag) { if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); mutex_unlock(&ctx->uring_lock); } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); } } -- cgit v1.2.3 From 9a6924519e5e882631a7fff429facca838207e45 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 24 Nov 2022 01:35:59 -0800 Subject: io_uring: allow multishot polled reqs to defer completion Until now there was no reason for multishot polled requests to defer completions as there was no functional difference. However now this will actually defer the completions, for a performance win. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221124093559.3780686-10-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7ed9cbeb573f..72c97af4f292 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1803,7 +1803,8 @@ int io_poll_issue(struct io_kiocb *req, bool *locked) io_tw_lock(req->ctx, locked); if (unlikely(req->task->flags & PF_EXITING)) return -EFAULT; - return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT); + return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| + IO_URING_F_COMPLETE_DEFER); } struct io_wq_work *io_wq_free_work(struct io_wq_work *work) -- cgit v1.2.3 From 27f35fe9096b183d45ff6f22ad277ddf107d8428 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 25 Nov 2022 02:34:10 -0800 Subject: io_uring: remove io_req_complete_post_tw It's only used in one place. Inline it. 
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221125103412.1425305-2-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 72c97af4f292..24aa049fe7e1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1519,7 +1519,7 @@ void io_req_task_complete(struct io_kiocb *req, bool *locked) if (*locked) io_req_complete_defer(req); else - io_req_complete_post_tw(req, locked); + io_req_complete_post(req, IO_URING_F_UNLOCKED); } /* diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index dcb8e3468f1d..76659d2fc90c 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -37,13 +37,6 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); -static inline void io_req_complete_post_tw(struct io_kiocb *req, bool *locked) -{ - unsigned flags = *locked ? 0 : IO_URING_F_UNLOCKED; - - io_req_complete_post(req, flags); -} - struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); -- cgit v1.2.3 From 10d8bc35416d9e83ffe9644478756281c7bd4f52 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 25 Nov 2022 02:34:11 -0800 Subject: io_uring: spelling fix s/pushs/pushes/ Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20221125103412.1425305-3-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 24aa049fe7e1..d9c9e347346d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2706,7 +2706,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * lock(&ep->mtx); * * Users may get EPOLLIN meanwhile seeing nothing in cqring, this - * pushs them to do the flush. + * pushes them to do the flush. */ if (io_cqring_events(ctx) || io_has_work(ctx)) -- cgit v1.2.3 From 5d772916855f593672de55c437925daccc8ecd73 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Nov 2022 19:46:41 +0000 Subject: io_uring: keep unlock_post inlined in hot path This partially reverts 6c16fe3c16bdc ("io_uring: kill io_cqring_ev_posted() and __io_cq_unlock_post()") The redundancy of __io_cq_unlock_post() was always to keep it inlined into __io_submit_flush_completions(). Inline it back and rename with hope of clarifying the intention behind it. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/372a16c485fca44c069be2e92fc5e7332a1d7fd7.1669310258.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d9c9e347346d..adecdf65b130 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -582,7 +582,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_flush_signal(ctx); } -void io_cq_unlock_post(struct io_ring_ctx *ctx) +/* keep it inlined for io_submit_flush_completions() */ +static inline void io_cq_unlock_post_inline(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); @@ -592,6 +593,12 @@ void io_cq_unlock_post(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } +void io_cq_unlock_post(struct io_ring_ctx *ctx) + __releases(ctx->completion_lock) +{ + io_cq_unlock_post_inline(ctx); +} + /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -1389,7 +1396,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(ctx, req); } - io_cq_unlock_post(ctx); + io_cq_unlock_post_inline(ctx); if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); -- cgit v1.2.3
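For context, a minimal userspace sketch of the kind of workload this series targets: a single-issuer ring with IORING_SETUP_DEFER_TASKRUN running multishot accept, whose extra CQEs (carrying IORING_CQE_F_MORE) can now be staged in the 16-entry completion cache and flushed in one batch. This is illustrative only and not part of the patches above; it assumes kernel 6.1+, liburing 2.2+, and a hypothetical already-listening socket listen_fd.

/* Sketch under the assumptions stated above. */
#include <liburing.h>
#include <unistd.h>

static int serve(int listen_fd)
{
	struct io_uring_params p = {
		.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN,
	};
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	unsigned head, seen;
	int ret;

	ret = io_uring_queue_init_params(64, &ring, &p);
	if (ret)
		return ret;

	/* one SQE keeps accepting; each connection arrives as an aux CQE */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);

	for (;;) {
		/* with DEFER_TASKRUN, waiting here is what lets the deferred
		 * task work (and the batched aux CQE flush) run on this task */
		ret = io_uring_submit_and_wait(&ring, 1);
		if (ret < 0)
			break;

		seen = 0;
		io_uring_for_each_cqe(&ring, head, cqe) {
			seen++;
			if (cqe->res >= 0)
				close(cqe->res);	/* handle the new fd; just close it in this sketch */
			/* if IORING_CQE_F_MORE is clear, the accept stopped
			 * and would need to be re-armed (omitted here) */
		}
		io_uring_cq_advance(&ring, seen);
	}
	io_uring_queue_exit(&ring);
	return ret;
}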