From 87c1696655787895689618c8b63c5efe66b8f2ab Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 21 Sep 2021 08:24:57 -0600
Subject: io-wq: ensure we exit if thread group is exiting

Dave reports that a coredumping workload gets stuck in 5.15-rc2, and
identified the culprit in the Fixes line below. The problem is that
relying solely on fatal_signal_pending() to gate whether to exit or not
fails miserably if a process gets e.g. SIGILL sent. Don't rely
exclusively on fatal signals; also check if the thread group is exiting.

Fixes: 15e20db2e0ce ("io-wq: only exit on fatal signals")
Reported-by: Dave Chinner
Signed-off-by: Jens Axboe
---
 fs/io-wq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index c2e0e8e80949..c2360cdc403d 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -584,7 +584,8 @@ loop:
 			if (!get_signal(&ksig))
 				continue;
-			if (fatal_signal_pending(current))
+			if (fatal_signal_pending(current) ||
+			    signal_group_exit(current->signal))
 				break;
 			continue;
 		}
--
cgit v1.2.3

From bd99c71bd14072ce2920f6d0c2fe43df072c653c Mon Sep 17 00:00:00 2001
From: Hao Xu
Date: Wed, 22 Sep 2021 18:12:36 +0800
Subject: io_uring: fix race between poll completion and cancel_hash insertion

If poll arming and poll completion run in parallel, there may be races.
For instance, run io_poll_add in iowq and io_poll_task_func in the
original context, then:

  iowq                                original context
  io_poll_add
    vfs_poll
      (interruption happens,
       tw queued to original
       context)
                                      io_poll_task_func
                                        generate cqe
                                        del from cancel_hash[]
                                        if !poll.done
                                          insert to cancel_hash[]

The entry is left in cancel_hash[]; a similar case exists for fast poll.
Fix it by setting poll.done = true when deleting from cancel_hash[].

Fixes: 5082620fb2ca ("io_uring: terminate multishot poll for CQ ring overflow")
Signed-off-by: Hao Xu
Link: https://lore.kernel.org/r/20210922101238.7177-2-haoxu@linux.alibaba.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e372d5b9f6dc..43530aae6180 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5337,10 +5337,8 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
 	}
 	if (req->poll.events & EPOLLONESHOT)
 		flags = 0;
-	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
-		req->poll.done = true;
+	if (!io_cqring_fill_event(ctx, req->user_data, error, flags))
 		flags = 0;
-	}

 	if (flags & IORING_CQE_F_MORE)
 		ctx->cq_extra++;
@@ -5371,6 +5369,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 		if (done) {
 			io_poll_remove_double(req);
 			hash_del(&req->hash_node);
+			req->poll.done = true;
 		} else {
 			req->result = 0;
 			add_wait_queue(req->poll.head, &req->poll.wait);
@@ -5508,6 +5507,7 @@ static void io_async_task_func(struct io_kiocb *req, bool *locked)

 	hash_del(&req->hash_node);
 	io_poll_remove_double(req);
+	apoll->poll.done = true;
 	spin_unlock(&ctx->completion_lock);

 	if (!READ_ONCE(apoll->poll.canceled))
--
cgit v1.2.3

From a62682f92eedb41c1cd8290fa875a4b85624fb9a Mon Sep 17 00:00:00 2001
From: Hao Xu
Date: Wed, 22 Sep 2021 18:12:37 +0800
Subject: io_uring: fix missing set of EPOLLONESHOT for CQ ring overflow

We should set EPOLLONESHOT if cqring_fill_event() returns false, since
io_poll_add() uses it to decide whether to put the req or not.
Fixes: 5082620fb2ca ("io_uring: terminate multishot poll for CQ ring overflow")
Signed-off-by: Hao Xu
Link: https://lore.kernel.org/r/20210922101238.7177-3-haoxu@linux.alibaba.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 43530aae6180..ac0c06d5c629 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5337,8 +5337,10 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
 	}
 	if (req->poll.events & EPOLLONESHOT)
 		flags = 0;
-	if (!io_cqring_fill_event(ctx, req->user_data, error, flags))
+	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
+		req->poll.events |= EPOLLONESHOT;
 		flags = 0;
+	}

 	if (flags & IORING_CQE_F_MORE)
 		ctx->cq_extra++;
--
cgit v1.2.3

From 5b7aa38d86f348847a48f71e9ac7715406de900e Mon Sep 17 00:00:00 2001
From: Hao Xu
Date: Wed, 22 Sep 2021 18:12:38 +0800
Subject: io_uring: fix potential req refcount underflow

For multishot mode, there may be cases like:

  iowq                                original context
  io_poll_add
    _arm_poll()
      mask = vfs_poll() is not 0
      if mask
        (2) io_poll_complete()
      compl_unlock
      (interruption happens,
       tw queued to original
       context)
                                      io_poll_task_func()
                                        compl_lock
                                        (3) done = io_poll_complete() is true
                                        compl_unlock
                                        put req ref
  (1) if (poll->flags & EPOLLONESHOT)
        put req ref

The EPOLLONESHOT flag in (1) may be from (2) or (3), so there are
multiple combinations that can cause a ref underflow. Let's address it
by:
 - checking the return value in (2) as done
 - changing (1) to if (done); this way, we only put the ref in (1) if
   the oneshot flag is from (2)
 - checking poll.done in io_poll_task_func(), so that we won't put the
   ref a second time

Signed-off-by: Hao Xu
Link: https://lore.kernel.org/r/20210922101238.7177-4-haoxu@linux.alibaba.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ac0c06d5c629..7707cdb7b372 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5367,6 +5367,10 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 	} else {
 		bool done;

+		if (req->poll.done) {
+			spin_unlock(&ctx->completion_lock);
+			return;
+		}
 		done = __io_poll_complete(req, req->result);
 		if (done) {
 			io_poll_remove_double(req);
@@ -5830,6 +5834,7 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_poll_table ipt;
 	__poll_t mask;
+	bool done;

 	ipt.pt._qproc = io_poll_queue_proc;
@@ -5838,13 +5843,13 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 	if (mask) { /* no async, we'd stolen it */
 		ipt.error = 0;
-		io_poll_complete(req, mask);
+		done = io_poll_complete(req, mask);
 	}
 	spin_unlock(&ctx->completion_lock);

 	if (mask) {
 		io_cqring_ev_posted(ctx);
-		if (poll->events & EPOLLONESHOT)
+		if (done)
 			io_put_req(req);
 	}
 	return ipt.error;
--
cgit v1.2.3

From 8bab4c09f24ec8d4a7a78ab343620f89d3a24804 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 24 Sep 2021 07:12:27 -0600
Subject: io_uring: allow conditional reschedule for intensive iterators

If we have a lot of threads and rings, the tctx list can get quite big.
This is especially true if we keep creating new threads and rings.
Likewise for the provided buffers list. Be nice and insert a conditional
reschedule point while iterating the nodes for deletion.
Link: https://lore.kernel.org/io-uring/00000000000064b6b405ccb41113@google.com/
Reported-by: syzbot+111d2a03f51f5ae73775@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7707cdb7b372..ef3c94a55fbd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9173,8 +9173,10 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
 	struct io_buffer *buf;
 	unsigned long index;

-	xa_for_each(&ctx->io_buffers, index, buf)
+	xa_for_each(&ctx->io_buffers, index, buf) {
 		__io_remove_buffers(ctx, buf, index, -1U);
+		cond_resched();
+	}
 }

 static void io_req_cache_free(struct list_head *list)
@@ -9672,8 +9674,10 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
 	struct io_tctx_node *node;
 	unsigned long index;

-	xa_for_each(&tctx->xa, index, node)
+	xa_for_each(&tctx->xa, index, node) {
 		io_uring_del_tctx_node(index);
+		cond_resched();
+	}
 	if (wq) {
 		/*
 		 * Must be after io_uring_del_task_file() (removes nodes under
--
cgit v1.2.3

From 9990da93d2bf9892c2c14c958bef050d4e461a1a Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 24 Sep 2021 07:39:08 -0600
Subject: io_uring: put provided buffer meta data under memcg accounting

For each provided buffer, we allocate a struct io_buffer to hold the
data associated with it. As a large number of buffers can be provided,
account that data with memcg.

Fixes: ddf0322db79c ("io_uring: add IORING_OP_PROVIDE_BUFFERS")
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ef3c94a55fbd..01e49d01fe74 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4387,7 +4387,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
 	int i, bid = pbuf->bid;

 	for (i = 0; i < pbuf->nbufs; i++) {
-		buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+		buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
 		if (!buf)
 			break;
--
cgit v1.2.3

From cdb31c29d397a8076d81fd1458d091c647ef94ba Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 24 Sep 2021 08:43:54 -0600
Subject: io_uring: don't punt files update to io-wq unconditionally

There's no reason to punt it unconditionally; we just need to ensure
that the submit lock grabbing is conditional.
Fixes: 05f3fb3c5397 ("io_uring: avoid ring quiesce for fixed file set unregister and update")
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 01e49d01fe74..c6139ace11fa 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6340,19 +6340,16 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_uring_rsrc_update2 up;
 	int ret;

-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
-
 	up.offset = req->rsrc_update.offset;
 	up.data = req->rsrc_update.arg;
 	up.nr = 0;
 	up.tags = 0;
 	up.resv = 0;

-	mutex_lock(&ctx->uring_lock);
+	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
 	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up,
 					req->rsrc_update.nr_args);
-	mutex_unlock(&ctx->uring_lock);
+	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));

 	if (ret < 0)
 		req_set_fail(req);
--
cgit v1.2.3

From 9f3a2cb228c28606895d15f13b30d1f7402dc745 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 24 Sep 2021 17:14:48 +0100
Subject: io_uring: kill extra checks in io_write()

We don't retry short writes, so we would never get to async setup in
io_write() in that case. Thus ret2 > 0 is always false and
iov_iter_advance() is never used. Apparently, the same was found by
Coverity, which complains about the code.

Fixes: cd65869512ab ("io_uring: use iov_iter state save/restore helpers")
Reported-by: Dave Jones
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/5b33e61034748ef1022766efc0fb8854cfcf749c.1632500058.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c6139ace11fa..2b3232d53e79 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3605,7 +3605,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
 		iov_iter_save_state(iter, state);
 	}
 	req->result = iov_iter_count(iter);
-	ret2 = 0;

 	/* Ensure we clear previously set non-block flag */
 	if (!force_nonblock)
@@ -3670,8 +3669,6 @@ done:
 	} else {
copy_iov:
 		iov_iter_restore(iter, state);
-		if (ret2 > 0)
-			iov_iter_advance(iter, ret2);
 		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
 		return ret ?: -EAGAIN;
 	}
--
cgit v1.2.3

From 7df778be2f61e1a23002d1f2f5d6aaf702771eb8 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 24 Sep 2021 20:04:29 +0100
Subject: io_uring: make OP_CLOSE consistent with direct open

Since recently, open/accept are able to manipulate the fixed file
table, but it's inconsistent that close can't. Close the gap and keep
the API the same as with open/accept, i.e. via sqe->file_slot.
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2b3232d53e79..82f867983bb3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -502,6 +502,7 @@ struct io_poll_update {
 struct io_close {
 	struct file			*file;
 	int				fd;
+	u32				file_slot;
 };

 struct io_timeout_data {
@@ -1098,6 +1099,8 @@ static int io_req_prep_async(struct io_kiocb *req);
 static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
 				 unsigned int issue_flags, u32 slot_index);

+static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
+
 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);

 static struct kmem_cache *req_cachep;
@@ -4591,12 +4594,16 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
-	    sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
+	    sqe->rw_flags || sqe->buf_index)
 		return -EINVAL;
 	if (req->flags & REQ_F_FIXED_FILE)
 		return -EBADF;

 	req->close.fd = READ_ONCE(sqe->fd);
+	req->close.file_slot = READ_ONCE(sqe->file_index);
+	if (req->close.file_slot && req->close.fd)
+		return -EINVAL;
+
 	return 0;
 }
@@ -4608,6 +4615,11 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
 	struct file *file = NULL;
 	int ret = -EBADF;

+	if (req->close.file_slot) {
+		ret = io_close_fixed(req, issue_flags);
+		goto err;
+	}
+
 	spin_lock(&files->file_lock);
 	fdt = files_fdtable(files);
 	if (close->fd >= fdt->max_fds) {
@@ -8401,6 +8413,44 @@ err:
 	return ret;
 }

+static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
+{
+	unsigned int offset = req->close.file_slot - 1;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_fixed_file *file_slot;
+	struct file *file;
+	int ret, i;
+
+	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+	ret = -ENXIO;
+	if (unlikely(!ctx->file_data))
+		goto out;
+	ret = -EINVAL;
+	if (offset >= ctx->nr_user_files)
+		goto out;
+	ret = io_rsrc_node_switch_start(ctx);
+	if (ret)
+		goto out;
+
+	i = array_index_nospec(offset, ctx->nr_user_files);
+	file_slot = io_fixed_file_slot(&ctx->file_table, i);
+	ret = -EBADF;
+	if (!file_slot->file_ptr)
+		goto out;
+
+	file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+	ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
+	if (ret)
+		goto out;
+
+	file_slot->file_ptr = 0;
+	io_rsrc_node_switch(ctx, ctx->file_data);
+	ret = 0;
+out:
+	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+	return ret;
+}
+
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				 struct io_uring_rsrc_update2 *up,
 				 unsigned nr_args)
--
cgit v1.2.3
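
As a usage illustration (not part of the patches above), the following hypothetical
userspace snippet closes a fixed-file table slot by issuing IORING_OP_CLOSE with
sqe->file_index set, matching the semantics added by the last commit. It assumes
liburing for ring setup and a 5.15+ uapi header that exposes the file_index field;
the close_fixed_slot() helper name is invented for the example. A file_index of
zero keeps the old behaviour (a normal close of sqe->fd), so existing users are
unaffected.

/*
 * Hypothetical sketch: close the fixed-file table entry in 'slot' via
 * IORING_OP_CLOSE. Assumes liburing and a kernel/header with sqe->file_index.
 */
#include <liburing.h>

static int close_fixed_slot(struct io_uring *ring, unsigned int slot)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;

	/* fd must stay 0: the kernel rejects SQEs that set both fd and file_index */
	io_uring_prep_close(sqe, 0);
	/* file_index is the slot offset by one; 0 means "regular close(fd)" */
	sqe->file_index = slot + 1;

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;

	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;

	ret = cqe->res;		/* 0 on success, negative errno on failure */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}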