From 87c1696655787895689618c8b63c5efe66b8f2ab Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 21 Sep 2021 08:24:57 -0600
Subject: io-wq: ensure we exit if thread group is exiting

Dave reports that a coredumping workload gets stuck in 5.15-rc2, and
identified the culprit in the Fixes line below. The problem is that
relying solely on fatal_signal_pending() to gate whether to exit or not
fails miserably if a process gets e.g. SIGILL sent. Don't rely
exclusively on fatal signals; also check if the thread group is exiting.

Fixes: 15e20db2e0ce ("io-wq: only exit on fatal signals")
Reported-by: Dave Chinner
Signed-off-by: Jens Axboe
---
 fs/io-wq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index c2e0e8e80949..c2360cdc403d 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -584,7 +584,8 @@ loop:
 			if (!get_signal(&ksig))
 				continue;
-			if (fatal_signal_pending(current))
+			if (fatal_signal_pending(current) ||
+			    signal_group_exit(current->signal))
 				break;
 			continue;
 		}
--
cgit v1.2.3

From bd99c71bd14072ce2920f6d0c2fe43df072c653c Mon Sep 17 00:00:00 2001
From: Hao Xu
Date: Wed, 22 Sep 2021 18:12:36 +0800
Subject: io_uring: fix race between poll completion and cancel_hash insertion

If poll arming and poll completion run in parallel, there may be races.
For instance, run io_poll_add in iowq and io_poll_task_func in the
original context, then:

  iowq                                original context
  io_poll_add
    vfs_poll
      (interruption happens,
       tw queued to original
       context)
                                      io_poll_task_func
                                        generate cqe
                                        del from cancel_hash[]
                                        if !poll.done
                                          insert to cancel_hash[]

The entry is left in cancel_hash[]; a similar case exists for fast poll.
Fix it by setting poll.done = true when deleting from cancel_hash[].

Fixes: 5082620fb2ca ("io_uring: terminate multishot poll for CQ ring overflow")
Signed-off-by: Hao Xu
Link: https://lore.kernel.org/r/20210922101238.7177-2-haoxu@linux.alibaba.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e372d5b9f6dc..43530aae6180 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5337,10 +5337,8 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
 	}
 	if (req->poll.events & EPOLLONESHOT)
 		flags = 0;
-	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
-		req->poll.done = true;
+	if (!io_cqring_fill_event(ctx, req->user_data, error, flags))
 		flags = 0;
-	}

 	if (flags & IORING_CQE_F_MORE)
 		ctx->cq_extra++;
@@ -5371,6 +5369,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 		if (done) {
 			io_poll_remove_double(req);
 			hash_del(&req->hash_node);
+			req->poll.done = true;
 		} else {
 			req->result = 0;
 			add_wait_queue(req->poll.head, &req->poll.wait);
@@ -5508,6 +5507,7 @@ static void io_async_task_func(struct io_kiocb *req, bool *locked)

 	hash_del(&req->hash_node);
 	io_poll_remove_double(req);
+	apoll->poll.done = true;
 	spin_unlock(&ctx->completion_lock);

 	if (!READ_ONCE(apoll->poll.canceled))
--
cgit v1.2.3

From a62682f92eedb41c1cd8290fa875a4b85624fb9a Mon Sep 17 00:00:00 2001
From: Hao Xu
Date: Wed, 22 Sep 2021 18:12:37 +0800
Subject: io_uring: fix missing set of EPOLLONESHOT for CQ ring overflow

We should set EPOLLONESHOT if cqring_fill_event() returns false, since
io_poll_add() uses it to decide whether to put the req or not.
Fixes: 5082620fb2ca ("io_uring: terminate multishot poll for CQ ring overflow")
Signed-off-by: Hao Xu
Link: https://lore.kernel.org/r/20210922101238.7177-3-haoxu@linux.alibaba.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 43530aae6180..ac0c06d5c629 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5337,8 +5337,10 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
 	}
 	if (req->poll.events & EPOLLONESHOT)
 		flags = 0;
-	if (!io_cqring_fill_event(ctx, req->user_data, error, flags))
+	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
+		req->poll.events |= EPOLLONESHOT;
 		flags = 0;
+	}

 	if (flags & IORING_CQE_F_MORE)
 		ctx->cq_extra++;
--
cgit v1.2.3

From 5b7aa38d86f348847a48f71e9ac7715406de900e Mon Sep 17 00:00:00 2001
From: Hao Xu
Date: Wed, 22 Sep 2021 18:12:38 +0800
Subject: io_uring: fix potential req refcount underflow

For multishot mode, there may be cases like:

  iowq                                original context
  io_poll_add
    _arm_poll()
      mask = vfs_poll() is not 0
      if mask
        (2) io_poll_complete()
      compl_unlock
      (interruption happens,
       tw queued to original
       context)
                                      io_poll_task_func()
                                        compl_lock
                                        (3) done = io_poll_complete() is true
                                        compl_unlock
                                        put req ref
  (1) if (poll->flags & EPOLLONESHOT)
        put req ref

The EPOLLONESHOT flag in (1) may be from (2) or (3), so there are
multiple combinations that can cause a ref underflow. Let's address it
by:
 - checking the return value in (2) as done
 - changing (1) to if (done); this way, we only put the ref in (1) if
   the oneshot flag is from (2)
 - checking poll.done in io_poll_task_func(), so that we won't put the
   ref a second time

Signed-off-by: Hao Xu
Link: https://lore.kernel.org/r/20210922101238.7177-4-haoxu@linux.alibaba.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ac0c06d5c629..7707cdb7b372 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5367,6 +5367,10 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 	} else {
 		bool done;

+		if (req->poll.done) {
+			spin_unlock(&ctx->completion_lock);
+			return;
+		}
 		done = __io_poll_complete(req, req->result);
 		if (done) {
 			io_poll_remove_double(req);
@@ -5830,6 +5834,7 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_poll_table ipt;
 	__poll_t mask;
+	bool done;

 	ipt.pt._qproc = io_poll_queue_proc;
@@ -5838,13 +5843,13 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 	if (mask) { /* no async, we'd stolen it */
 		ipt.error = 0;
-		io_poll_complete(req, mask);
+		done = io_poll_complete(req, mask);
 	}
 	spin_unlock(&ctx->completion_lock);

 	if (mask) {
 		io_cqring_ev_posted(ctx);
-		if (poll->events & EPOLLONESHOT)
+		if (done)
 			io_put_req(req);
 	}
 	return ipt.error;
--
cgit v1.2.3

From 8bab4c09f24ec8d4a7a78ab343620f89d3a24804 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 24 Sep 2021 07:12:27 -0600
Subject: io_uring: allow conditional reschedule for intensive iterators

If we have a lot of threads and rings, the tctx list can get quite big.
This is especially true if we keep creating new threads and rings.
Likewise for the provided buffers list. Be nice and insert a conditional
reschedule point while iterating the nodes for deletion.
Link: https://lore.kernel.org/io-uring/00000000000064b6b405ccb41113@google.com/
Reported-by: syzbot+111d2a03f51f5ae73775@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7707cdb7b372..ef3c94a55fbd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9173,8 +9173,10 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
 	struct io_buffer *buf;
 	unsigned long index;

-	xa_for_each(&ctx->io_buffers, index, buf)
+	xa_for_each(&ctx->io_buffers, index, buf) {
 		__io_remove_buffers(ctx, buf, index, -1U);
+		cond_resched();
+	}
 }

 static void io_req_cache_free(struct list_head *list)
@@ -9672,8 +9674,10 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
 	struct io_tctx_node *node;
 	unsigned long index;

-	xa_for_each(&tctx->xa, index, node)
+	xa_for_each(&tctx->xa, index, node) {
 		io_uring_del_tctx_node(index);
+		cond_resched();
+	}
 	if (wq) {
 		/*
 		 * Must be after io_uring_del_task_file() (removes nodes under
--
cgit v1.2.3

From 9990da93d2bf9892c2c14c958bef050d4e461a1a Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 24 Sep 2021 07:39:08 -0600
Subject: io_uring: put provided buffer meta data under memcg accounting

For each provided buffer, we allocate a struct io_buffer to hold the
data associated with it. As a large number of buffers can be provided,
account that data with memcg.

Fixes: ddf0322db79c ("io_uring: add IORING_OP_PROVIDE_BUFFERS")
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ef3c94a55fbd..01e49d01fe74 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4387,7 +4387,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
 	int i, bid = pbuf->bid;

 	for (i = 0; i < pbuf->nbufs; i++) {
-		buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+		buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
 		if (!buf)
 			break;
--
cgit v1.2.3

From cdb31c29d397a8076d81fd1458d091c647ef94ba Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 24 Sep 2021 08:43:54 -0600
Subject: io_uring: don't punt files update to io-wq unconditionally

There's no reason to punt it unconditionally; we just need to ensure
that the submit lock grabbing is conditional.
Fixes: 05f3fb3c5397 ("io_uring: avoid ring quiesce for fixed file set unregister and update")
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 01e49d01fe74..c6139ace11fa 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6340,19 +6340,16 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_uring_rsrc_update2 up;
 	int ret;

-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
-
 	up.offset = req->rsrc_update.offset;
 	up.data = req->rsrc_update.arg;
 	up.nr = 0;
 	up.tags = 0;
 	up.resv = 0;

-	mutex_lock(&ctx->uring_lock);
+	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
 	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up,
 					req->rsrc_update.nr_args);
-	mutex_unlock(&ctx->uring_lock);
+	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));

 	if (ret < 0)
 		req_set_fail(req);
--
cgit v1.2.3

From 9f3a2cb228c28606895d15f13b30d1f7402dc745 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 24 Sep 2021 17:14:48 +0100
Subject: io_uring: kill extra checks in io_write()

We don't retry short writes, so we would never get to async setup in
io_write() in that case. Thus ret2 > 0 is always false and
iov_iter_advance() is never used. Apparently, the same was found by
Coverity, which complains about the code.

Fixes: cd65869512ab ("io_uring: use iov_iter state save/restore helpers")
Reported-by: Dave Jones
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/5b33e61034748ef1022766efc0fb8854cfcf749c.1632500058.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c6139ace11fa..2b3232d53e79 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3605,7 +3605,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
 		iov_iter_save_state(iter, state);
 	}
 	req->result = iov_iter_count(iter);
-	ret2 = 0;

 	/* Ensure we clear previously set non-block flag */
 	if (!force_nonblock)
@@ -3670,8 +3669,6 @@ done:
 	} else {
copy_iov:
 		iov_iter_restore(iter, state);
-		if (ret2 > 0)
-			iov_iter_advance(iter, ret2);
 		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
 		return ret ?: -EAGAIN;
 	}
--
cgit v1.2.3

From 7df778be2f61e1a23002d1f2f5d6aaf702771eb8 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 24 Sep 2021 20:04:29 +0100
Subject: io_uring: make OP_CLOSE consistent with direct open

Since recently, open/accept are able to manipulate the fixed file
table, but it's inconsistent that close can't. Close the gap and keep
the API the same as with open/accept, i.e. via sqe->file_slot.
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2b3232d53e79..82f867983bb3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -502,6 +502,7 @@ struct io_poll_update {
 struct io_close {
 	struct file			*file;
 	int				fd;
+	u32				file_slot;
 };

 struct io_timeout_data {
@@ -1098,6 +1099,8 @@ static int io_req_prep_async(struct io_kiocb *req);
 static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
 				 unsigned int issue_flags, u32 slot_index);

+static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
+
 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);

 static struct kmem_cache *req_cachep;
@@ -4591,12 +4594,16 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
-	    sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
+	    sqe->rw_flags || sqe->buf_index)
 		return -EINVAL;
 	if (req->flags & REQ_F_FIXED_FILE)
 		return -EBADF;

 	req->close.fd = READ_ONCE(sqe->fd);
+	req->close.file_slot = READ_ONCE(sqe->file_index);
+	if (req->close.file_slot && req->close.fd)
+		return -EINVAL;
+
 	return 0;
 }
@@ -4608,6 +4615,11 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
 	struct file *file = NULL;
 	int ret = -EBADF;

+	if (req->close.file_slot) {
+		ret = io_close_fixed(req, issue_flags);
+		goto err;
+	}
+
 	spin_lock(&files->file_lock);
 	fdt = files_fdtable(files);
 	if (close->fd >= fdt->max_fds) {
@@ -8401,6 +8413,44 @@ err:
 	return ret;
 }

+static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
+{
+	unsigned int offset = req->close.file_slot - 1;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_fixed_file *file_slot;
+	struct file *file;
+	int ret, i;
+
+	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+	ret = -ENXIO;
+	if (unlikely(!ctx->file_data))
+		goto out;
+	ret = -EINVAL;
+	if (offset >= ctx->nr_user_files)
+		goto out;
+	ret = io_rsrc_node_switch_start(ctx);
+	if (ret)
+		goto out;
+
+	i = array_index_nospec(offset, ctx->nr_user_files);
+	file_slot = io_fixed_file_slot(&ctx->file_table, i);
+	ret = -EBADF;
+	if (!file_slot->file_ptr)
+		goto out;
+
+	file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+	ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
+	if (ret)
+		goto out;
+
+	file_slot->file_ptr = 0;
+	io_rsrc_node_switch(ctx, ctx->file_data);
+	ret = 0;
+out:
+	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+	return ret;
+}
+
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				 struct io_uring_rsrc_update2 *up,
 				 unsigned nr_args)
--
cgit v1.2.3
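
As a usage illustration (not part of the patches above), the following hypothetical
userspace snippet closes a fixed-file table slot by issuing IORING_OP_CLOSE with
sqe->file_index set, matching the semantics added by the last commit. It assumes
liburing for ring setup and a 5.15+ uapi header that exposes the file_index field;
the close_fixed_slot() helper name is invented for the example. A file_index of
zero keeps the old behaviour (a normal close of sqe->fd), so existing users are
unaffected.

/*
 * Hypothetical sketch: close the fixed-file table entry in 'slot' via
 * IORING_OP_CLOSE. Assumes liburing and a kernel/header with sqe->file_index.
 */
#include <liburing.h>

static int close_fixed_slot(struct io_uring *ring, unsigned int slot)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;

	/* fd must stay 0: the kernel rejects SQEs that set both fd and file_index */
	io_uring_prep_close(sqe, 0);
	/* file_index is the slot offset by one; 0 means "regular close(fd)" */
	sqe->file_index = slot + 1;

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;

	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;

	ret = cqe->res;		/* 0 on success, negative errno on failure */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}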