From 16c8d2df7ec0eed31b7d3b61cb13206a7fb930cc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 12 Sep 2021 06:45:07 -0600
Subject: io_uring: ensure symmetry in handling iter types in loop_rw_iter()

When setting up the next segment, we check what type the iter is and
handle it accordingly. However, when incrementing and processed amount
we do not, and both iter advance and addr/len are adjusted, regardless
of type. Split the increment side just like we do on the setup side.

Fixes: 4017eb91a9e7 ("io_uring: make loop_rw_iter() use original user supplied pointers")
Cc: stable@vger.kernel.org
Reported-by: Valentina Palmiotti <vpalmiotti@gmail.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 16fb7436043c..66a7414c3756 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3263,12 +3263,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
 				ret = nr;
 			break;
 		}
+		if (!iov_iter_is_bvec(iter)) {
+			iov_iter_advance(iter, nr);
+		} else {
+			req->rw.len -= nr;
+			req->rw.addr += nr;
+		}
 		ret += nr;
 		if (nr != iovec.iov_len)
 			break;
-		req->rw.len -= nr;
-		req->rw.addr += nr;
-		iov_iter_advance(iter, nr);
 	}
 
 	return ret;
-- 
cgit v1.2.3


From 7a842fb589e3cdbe205bc16dc37c30cf13383159 Mon Sep 17 00:00:00 2001
From: Hao Xu <haoxu@linux.alibaba.com>
Date: Sun, 12 Sep 2021 03:40:50 +0800
Subject: io-wq: code clean of io_wqe_create_worker()

Remove do_create to save a local variable.

Signed-off-by: Hao Xu <haoxu@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 6c55362c1f99..a1685b40a4bf 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -246,8 +246,6 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe,
  */
 static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
 {
-	bool do_create = false;
-
 	/*
 	 * Most likely an attempt to queue unbounded work on an io_wq that
 	 * wasn't setup with any unbounded workers.
@@ -256,18 +254,15 @@ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
 		pr_warn_once("io-wq is not configured for unbound workers");
 
 	raw_spin_lock(&wqe->lock);
-	if (acct->nr_workers < acct->max_workers) {
-		acct->nr_workers++;
-		do_create = true;
+	if (acct->nr_workers == acct->max_workers) {
+		raw_spin_unlock(&wqe->lock);
+		return true;
 	}
+	acct->nr_workers++;
 	raw_spin_unlock(&wqe->lock);
-	if (do_create) {
-		atomic_inc(&acct->nr_running);
-		atomic_inc(&wqe->wq->worker_refs);
-		return create_io_worker(wqe->wq, wqe, acct->index);
-	}
-
-	return true;
+	atomic_inc(&acct->nr_running);
+	atomic_inc(&wqe->wq->worker_refs);
+	return create_io_worker(wqe->wq, wqe, acct->index);
 }
 
 static void io_wqe_inc_running(struct io_worker *worker)
-- 
cgit v1.2.3


From 767a65e9f31789d80e41edd03a802314905e8fbf Mon Sep 17 00:00:00 2001
From: Hao Xu <haoxu@linux.alibaba.com>
Date: Sun, 12 Sep 2021 03:40:52 +0800
Subject: io-wq: fix potential race of acct->nr_workers

Given max_worker is 1, and we currently have 1 running and it is
exiting. There may be race like:
 io_wqe_enqueue                   worker1
                               no work there and timeout
                               unlock(wqe->lock)
 ->insert work
                               -->io_worker_exit
 lock(wqe->lock)
 ->if(!nr_workers) //it's still 1
 unlock(wqe->lock)
    goto run_cancel
                                  lock(wqe->lock)
                                  nr_workers--
                                  ->dec_running
                                    ->worker creation fails
                                  unlock(wqe->lock)

We enqueued one work but there is no workers, causes hung.

Signed-off-by: Hao Xu <haoxu@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index a1685b40a4bf..3d4460df845c 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -176,7 +176,6 @@ static void io_worker_ref_put(struct io_wq *wq)
 static void io_worker_exit(struct io_worker *worker)
 {
 	struct io_wqe *wqe = worker->wqe;
-	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
 
 	if (refcount_dec_and_test(&worker->ref))
 		complete(&worker->ref_done);
@@ -186,7 +185,6 @@ static void io_worker_exit(struct io_worker *worker)
 	if (worker->flags & IO_WORKER_F_FREE)
 		hlist_nulls_del_rcu(&worker->nulls_node);
 	list_del_rcu(&worker->all_list);
-	acct->nr_workers--;
 	preempt_disable();
 	io_wqe_dec_running(worker);
 	worker->flags = 0;
@@ -569,6 +567,7 @@ loop:
 		}
 		/* timed out, exit unless we're the last worker */
 		if (last_timeout && acct->nr_workers > 1) {
+			acct->nr_workers--;
 			raw_spin_unlock(&wqe->lock);
 			__set_current_state(TASK_RUNNING);
 			break;
-- 
cgit v1.2.3


From dd47c104533dedb90434a3f142e94a671ac623a6 Mon Sep 17 00:00:00 2001
From: Eugene Syromiatnikov <esyr@redhat.com>
Date: Mon, 13 Sep 2021 17:44:15 +0200
Subject: io-wq: provide IO_WQ_* constants for IORING_REGISTER_IOWQ_MAX_WORKERS
 arg items

The items passed in the array pointed by the arg parameter
of IORING_REGISTER_IOWQ_MAX_WORKERS io_uring_register operation
carry certain semantics: they refer to different io-wq worker categories;
provide IO_WQ_* constants in the UAPI, so these categories can be referenced
in the user space code.

Suggested-by: Jens Axboe <axboe@kernel.dk>
Complements: 2e480058ddc21ec5 ("io-wq: provide a way to limit max number of workers")
Signed-off-by: Eugene Syromiatnikov <esyr@redhat.com>
Link: https://lore.kernel.org/r/20210913154415.GA12890@asgard.redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c                    | 5 +++++
 include/uapi/linux/io_uring.h | 8 +++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 3d4460df845c..c2e0e8e80949 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -14,6 +14,7 @@
 #include <linux/rculist_nulls.h>
 #include <linux/cpu.h>
 #include <linux/tracehook.h>
+#include <uapi/linux/io_uring.h>
 
 #include "io-wq.h"
 
@@ -1281,6 +1282,10 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
 {
 	int i, node, prev = 0;
 
+	BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND   != (int) IO_WQ_BOUND);
+	BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
+	BUILD_BUG_ON((int) IO_WQ_ACCT_NR      != 2);
+
 	for (i = 0; i < 2; i++) {
 		if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
 			new_count[i] = task_rlimit(current, RLIMIT_NPROC);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 59ef35154e3d..b270a07b285e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -317,13 +317,19 @@ enum {
 	IORING_REGISTER_IOWQ_AFF		= 17,
 	IORING_UNREGISTER_IOWQ_AFF		= 18,
 
-	/* set/get max number of workers */
+	/* set/get max number of io-wq workers */
 	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
 
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
 
+/* io-wq worker categories */
+enum {
+	IO_WQ_BOUND,
+	IO_WQ_UNBOUND,
+};
+
 /* deprecated, see struct io_uring_rsrc_update */
 struct io_uring_files_update {
 	__u32 offset;
-- 
cgit v1.2.3


From 41d3a6bd1d37149b18331fc4bb789c5456a7aeb0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 13 Sep 2021 13:08:51 -0600
Subject: io_uring: pin SQPOLL data before unlocking ring lock

We need to re-check sqd->thread after we've dropped the lock. Pin
the sqd before doing the lockdep lock dance, and check if the thread
is alive after that. It's either NULL or alive, as the SQPOLL thread
cannot exit without holding the same sqd->lock.

Reported-and-tested-by: syzbot+337de45f13a4fd54d708@syzkaller.appspotmail.com
Fixes: fa84693b3c89 ("io_uring: ensure IORING_REGISTER_IOWQ_MAX_WORKERS works with SQPOLL")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 66a7414c3756..a864a94364c6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -10563,10 +10563,12 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 			 * ordering. Fine to drop uring_lock here, we hold
 			 * a ref to the ctx.
 			 */
+			refcount_inc(&sqd->refs);
 			mutex_unlock(&ctx->uring_lock);
 			mutex_lock(&sqd->lock);
 			mutex_lock(&ctx->uring_lock);
-			tctx = sqd->thread->io_uring;
+			if (sqd->thread)
+				tctx = sqd->thread->io_uring;
 		}
 	} else {
 		tctx = current->io_uring;
@@ -10580,16 +10582,20 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 	if (ret)
 		goto err;
 
-	if (sqd)
+	if (sqd) {
 		mutex_unlock(&sqd->lock);
+		io_put_sq_data(sqd);
+	}
 
 	if (copy_to_user(arg, new_count, sizeof(new_count)))
 		return -EFAULT;
 
 	return 0;
 err:
-	if (sqd)
+	if (sqd) {
 		mutex_unlock(&sqd->lock);
+		io_put_sq_data(sqd);
+	}
 	return ret;
 }
 
-- 
cgit v1.2.3


From 44df58d441a94de40d52fca67dc60790daee4266 Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Date: Tue, 14 Sep 2021 22:38:52 +0800
Subject: io_uring: fix missing sigmask restore in io_cqring_wait()

Move get_timespec() section in io_cqring_wait() before the sigmask
saving, otherwise we'll fail to restore sigmask once get_timespec()
returns error.

Fixes: c73ebb685fb6 ("io_uring: add timeout support for io_uring_enter()")
Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Link: https://lore.kernel.org/r/20210914143852.9663-1-xiaoguang.wang@linux.alibaba.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a864a94364c6..94afd087e97b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7518,6 +7518,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			break;
 	} while (1);
 
+	if (uts) {
+		struct timespec64 ts;
+
+		if (get_timespec64(&ts, uts))
+			return -EFAULT;
+		timeout = timespec64_to_jiffies(&ts);
+	}
+
 	if (sig) {
 #ifdef CONFIG_COMPAT
 		if (in_compat_syscall())
@@ -7531,14 +7539,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
-	if (uts) {
-		struct timespec64 ts;
-
-		if (get_timespec64(&ts, uts))
-			return -EFAULT;
-		timeout = timespec64_to_jiffies(&ts);
-	}
-
 	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
 	iowq.wq.private = current;
 	INIT_LIST_HEAD(&iowq.wq.entry);
-- 
cgit v1.2.3


From 9c7b0ba887513b6681e7883d306df0a0fd580afa Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 14 Sep 2021 16:12:52 +0100
Subject: io_uring: auto-removal for direct open/accept

It might be inconvenient that direct open/accept deviates from the
update semantics and fails if the slot is taken instead of removing a
file sitting there. Implement this auto-removal.

Note that removal might need to allocate and so may fail. However, if an
empty slot is specified, it's guaraneed to not fail on the fd
installation side for valid userspace programs. It's needed for users
who can't tolerate such failures, e.g. accept where the other end
never retries.

Suggested-by: Franz-B. Tuneke <franz-bernhard.tuneke@tu-dortmund.de>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/c896f14ea46b0eaa6c09d93149e665c2c37979b4.1631632300.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 52 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 94afd087e97b..ae489ab275dd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8287,11 +8287,27 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
 #endif
 }
 
+static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
+				 struct io_rsrc_node *node, void *rsrc)
+{
+	struct io_rsrc_put *prsrc;
+
+	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
+	if (!prsrc)
+		return -ENOMEM;
+
+	prsrc->tag = *io_get_tag_slot(data, idx);
+	prsrc->rsrc = rsrc;
+	list_add(&prsrc->list, &node->rsrc_list);
+	return 0;
+}
+
 static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
 				 unsigned int issue_flags, u32 slot_index)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	bool needs_switch = false;
 	struct io_fixed_file *file_slot;
 	int ret = -EBADF;
 
@@ -8307,9 +8323,22 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
 
 	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
 	file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
-	ret = -EBADF;
-	if (file_slot->file_ptr)
-		goto err;
+
+	if (file_slot->file_ptr) {
+		struct file *old_file;
+
+		ret = io_rsrc_node_switch_start(ctx);
+		if (ret)
+			goto err;
+
+		old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+		ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
+					    ctx->rsrc_node, old_file);
+		if (ret)
+			goto err;
+		file_slot->file_ptr = 0;
+		needs_switch = true;
+	}
 
 	*io_get_tag_slot(ctx->file_data, slot_index) = 0;
 	io_fixed_file_set(file_slot, file);
@@ -8321,27 +8350,14 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
 
 	ret = 0;
 err:
+	if (needs_switch)
+		io_rsrc_node_switch(ctx, ctx->file_data);
 	io_ring_submit_unlock(ctx, !force_nonblock);
 	if (ret)
 		fput(file);
 	return ret;
 }
 
-static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
-				 struct io_rsrc_node *node, void *rsrc)
-{
-	struct io_rsrc_put *prsrc;
-
-	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
-	if (!prsrc)
-		return -ENOMEM;
-
-	prsrc->tag = *io_get_tag_slot(data, idx);
-	prsrc->rsrc = rsrc;
-	list_add(&prsrc->list, &node->rsrc_list);
-	return 0;
-}
-
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				 struct io_uring_rsrc_update2 *up,
 				 unsigned nr_args)
-- 
cgit v1.2.3


From 5d329e1286b0a040264e239b80257c937f6e685f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 14 Sep 2021 11:08:37 -0600
Subject: io_uring: allow retry for O_NONBLOCK if async is supported

A common complaint is that using O_NONBLOCK files with io_uring can be a
bit of a pain. Be a bit nicer and allow normal retry IFF the file does
support async behavior. This makes it possible to use io_uring more
reliably with O_NONBLOCK files, for use cases where it either isn't
possible or feasible to modify the file flags.

Cc: stable@vger.kernel.org
Reported-and-tested-by: Dan Melnic <dmm@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ae489ab275dd..3077f85a2638 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2843,7 +2843,8 @@ static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
 	return __io_file_supports_nowait(req->file, rw);
 }
 
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      int rw)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw.kiocb;
@@ -2865,8 +2866,13 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(ret))
 		return ret;
 
-	/* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
-	if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
+	/*
+	 * If the file is marked O_NONBLOCK, still allow retry for it if it
+	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
+	 * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
+	 */
+	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
+	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
 		req->flags |= REQ_F_NOWAIT;
 
 	ioprio = READ_ONCE(sqe->ioprio);
@@ -3349,7 +3355,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	if (unlikely(!(req->file->f_mode & FMODE_READ)))
 		return -EBADF;
-	return io_prep_rw(req, sqe);
+	return io_prep_rw(req, sqe, READ);
 }
 
 /*
@@ -3542,7 +3548,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
 		return -EBADF;
-	return io_prep_rw(req, sqe);
+	return io_prep_rw(req, sqe, WRITE);
 }
 
 static int io_write(struct io_kiocb *req, unsigned int issue_flags)
-- 
cgit v1.2.3