From 59960b9deb5354e4cdb0b6ed3a3b653a2b4eb602 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 16:36:30 +0300 Subject: io_uring: fix lazy work init Don't leave garbage in req.work before punting async on -EAGAIN in io_iopoll_queue(). [ 140.922099] general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI ... [ 140.922105] RIP: 0010:io_worker_handle_work+0x1db/0x480 ... [ 140.922114] Call Trace: [ 140.922118] ? __next_timer_interrupt+0xe0/0xe0 [ 140.922119] io_wqe_worker+0x2a9/0x360 [ 140.922121] ? _raw_spin_unlock_irqrestore+0x24/0x40 [ 140.922124] kthread+0x12c/0x170 [ 140.922125] ? io_worker_handle_work+0x480/0x480 [ 140.922126] ? kthread_park+0x90/0x90 [ 140.922127] ret_from_fork+0x22/0x30 Fixes: 7cdaf587de7c ("io_uring: avoid whole io_wq_work copy for requests completed inline") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 155f3d830ddb..c04b20bf2803 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1087,6 +1087,7 @@ static inline void io_prep_async_work(struct io_kiocb *req, req->work.flags |= IO_WQ_WORK_UNBOUND; } + io_req_init_async(req); io_req_work_grab_env(req, def); *link = io_prep_linked_timeout(req); -- cgit v1.2.3 From f4c2665e33f48904f2766d644df33fb3fd54b5ec Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:24:02 +0300 Subject: io-wq: reorder cancellation pending -> running Go all over all pending lists and cancel works there, and only then try to match running requests. No functional changes here, just a preparation for bulk cancellation. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 54 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 0b65a912b036..03c7e37548c2 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -927,19 +927,14 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) return ret; } -static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, - struct io_cb_cancel_data *match) +static bool io_wqe_cancel_pending_work(struct io_wqe *wqe, + struct io_cb_cancel_data *match) { struct io_wq_work_node *node, *prev; struct io_wq_work *work; unsigned long flags; bool found = false; - /* - * First check pending list, if we're lucky we can just remove it - * from there. CANCEL_OK means that the work is returned as-new, - * no completion will be posted for it. - */ spin_lock_irqsave(&wqe->lock, flags); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); @@ -952,21 +947,20 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, } spin_unlock_irqrestore(&wqe->lock, flags); - if (found) { + if (found) io_run_cancel(work, wqe); - return IO_WQ_CANCEL_OK; - } + return found; +} + +static bool io_wqe_cancel_running_work(struct io_wqe *wqe, + struct io_cb_cancel_data *match) +{ + bool found; - /* - * Now check if a free (going busy) or busy worker has the work - * currently running. If we find it there, we'll return CANCEL_RUNNING - * as an indication that we attempt to signal cancellation. The - * completion will run normally in this case. - */ rcu_read_lock(); found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); rcu_read_unlock(); - return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; + return found; } enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, @@ -976,18 +970,34 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, .fn = cancel, .data = data, }; - enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; int node; + /* + * First check pending list, if we're lucky we can just remove it + * from there. CANCEL_OK means that the work is returned as-new, + * no completion will be posted for it. + */ for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - ret = io_wqe_cancel_work(wqe, &match); - if (ret != IO_WQ_CANCEL_NOTFOUND) - break; + if (io_wqe_cancel_pending_work(wqe, &match)) + return IO_WQ_CANCEL_OK; } - return ret; + /* + * Now check if a free (going busy) or busy worker has the work + * currently running. If we find it there, we'll return CANCEL_RUNNING + * as an indication that we attempt to signal cancellation. The + * completion will run normally in this case. + */ + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + + if (io_wqe_cancel_running_work(wqe, &match)) + return IO_WQ_CANCEL_RUNNING; + } + + return IO_WQ_CANCEL_NOTFOUND; } static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) -- cgit v1.2.3 From 4f26bda1522c35d2701fc219368c7101c17005c1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:24:03 +0300 Subject: io-wq: add an option to cancel all matched reqs This adds support for cancelling all io-wq works matching a predicate. It isn't used yet, so no change in observable behaviour. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 60 +++++++++++++++++++++++++++++++++-------------------------- fs/io-wq.h | 2 +- fs/io_uring.c | 2 +- 3 files changed, 36 insertions(+), 28 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 03c7e37548c2..e290202d6f64 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -903,13 +903,15 @@ void io_wq_cancel_all(struct io_wq *wq) struct io_cb_cancel_data { work_cancel_fn *fn; void *data; + int nr_running; + int nr_pending; + bool cancel_all; }; static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_cb_cancel_data *match = data; unsigned long flags; - bool ret = false; /* * Hold the lock to avoid ->cur_work going out of scope, caller @@ -920,55 +922,55 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) && match->fn(worker->cur_work, match->data)) { send_sig(SIGINT, worker->task, 1); - ret = true; + match->nr_running++; } spin_unlock_irqrestore(&worker->lock, flags); - return ret; + return match->nr_running && !match->cancel_all; } -static bool io_wqe_cancel_pending_work(struct io_wqe *wqe, +static void io_wqe_cancel_pending_work(struct io_wqe *wqe, struct io_cb_cancel_data *match) { struct io_wq_work_node *node, *prev; struct io_wq_work *work; unsigned long flags; - bool found = false; +retry: spin_lock_irqsave(&wqe->lock, flags); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); + if (!match->fn(work, match->data)) + continue; - if (match->fn(work, match->data)) { - wq_list_del(&wqe->work_list, node, prev); - found = true; - break; - } + wq_list_del(&wqe->work_list, node, prev); + spin_unlock_irqrestore(&wqe->lock, flags); + io_run_cancel(work, wqe); + match->nr_pending++; + if (!match->cancel_all) + return; + + /* not safe to continue after unlock */ + goto retry; } spin_unlock_irqrestore(&wqe->lock, flags); - - if (found) - io_run_cancel(work, wqe); - return found; } -static bool io_wqe_cancel_running_work(struct io_wqe *wqe, +static void io_wqe_cancel_running_work(struct io_wqe *wqe, struct io_cb_cancel_data *match) { - bool found; - rcu_read_lock(); - found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); + io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); rcu_read_unlock(); - return found; } enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, - void *data) + void *data, bool cancel_all) { struct io_cb_cancel_data match = { - .fn = cancel, - .data = data, + .fn = cancel, + .data = data, + .cancel_all = cancel_all, }; int node; @@ -980,7 +982,8 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - if (io_wqe_cancel_pending_work(wqe, &match)) + io_wqe_cancel_pending_work(wqe, &match); + if (match.nr_pending && !match.cancel_all) return IO_WQ_CANCEL_OK; } @@ -993,10 +996,15 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - if (io_wqe_cancel_running_work(wqe, &match)) + io_wqe_cancel_running_work(wqe, &match); + if (match.nr_running && !match.cancel_all) return IO_WQ_CANCEL_RUNNING; } + if (match.nr_running) + return IO_WQ_CANCEL_RUNNING; + if (match.nr_pending) + return IO_WQ_CANCEL_OK; return IO_WQ_CANCEL_NOTFOUND; } @@ -1007,7 +1015,7 @@ static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) { - return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork); + return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false); } static bool io_wq_pid_match(struct io_wq_work *work, void *data) @@ -1021,7 +1029,7 @@ enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) { void *data = (void *) (unsigned long) pid; - return io_wq_cancel_cb(wq, io_wq_pid_match, data); + return io_wq_cancel_cb(wq, io_wq_pid_match, data, false); } struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) diff --git a/fs/io-wq.h b/fs/io-wq.h index 8e138fa88b9f..7d5bd431c5e3 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -130,7 +130,7 @@ enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid); typedef bool (work_cancel_fn)(struct io_wq_work *, void *); enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, - void *data); + void *data, bool cancel_all); struct task_struct *io_wq_get_task(struct io_wq *wq); diff --git a/fs/io_uring.c b/fs/io_uring.c index c04b20bf2803..94bd88556c1e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4773,7 +4773,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) enum io_wq_cancel cancel_ret; int ret = 0; - cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr); + cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; -- cgit v1.2.3 From 44e728b8aae0bb6d4229129083974f9dea43f50b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:24:04 +0300 Subject: io_uring: cancel all task's requests on exit If a process is going away, io_uring_flush() will cancel only 1 request with a matching pid. Cancel all of them Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 14 -------------- fs/io-wq.h | 1 - fs/io_uring.c | 14 ++++++++++++-- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index e290202d6f64..47c5f3aeb460 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1018,20 +1018,6 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false); } -static bool io_wq_pid_match(struct io_wq_work *work, void *data) -{ - pid_t pid = (pid_t) (unsigned long) data; - - return work->task_pid == pid; -} - -enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) -{ - void *data = (void *) (unsigned long) pid; - - return io_wq_cancel_cb(wq, io_wq_pid_match, data, false); -} - struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { int ret = -ENOMEM, node; diff --git a/fs/io-wq.h b/fs/io-wq.h index 7d5bd431c5e3..b72538fe5afd 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -125,7 +125,6 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work) void io_wq_cancel_all(struct io_wq *wq); enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); -enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid); typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/fs/io_uring.c b/fs/io_uring.c index 94bd88556c1e..ad5128a40c14 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7424,6 +7424,13 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } } +static bool io_cancel_pid_cb(struct io_wq_work *work, void *data) +{ + pid_t pid = (pid_t) (unsigned long) data; + + return work->task_pid == pid; +} + static int io_uring_flush(struct file *file, void *data) { struct io_ring_ctx *ctx = file->private_data; @@ -7433,8 +7440,11 @@ static int io_uring_flush(struct file *file, void *data) /* * If the task is going away, cancel work it may have pending */ - if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) - io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current)); + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { + void *data = (void *) (unsigned long)task_pid_vnr(current); + + io_wq_cancel_cb(ctx->io_wq, io_cancel_pid_cb, data, true); + } return 0; } -- cgit v1.2.3 From 67c4d9e693e3bb7fb968af24e3584f821a78ba56 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:24:05 +0300 Subject: io_uring: batch cancel in io_uring_cancel_files() Instead of waiting for each request one by one, first try to cancel all of them in a batched manner, and then go over inflight_list/etc to reap leftovers. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index ad5128a40c14..b6bcd5a7f4bc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7366,9 +7366,22 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } +static bool io_wq_files_match(struct io_wq_work *work, void *data) +{ + struct files_struct *files = data; + + return work->files == files; +} + static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { + if (list_empty_careful(&ctx->inflight_list)) + return; + + /* cancel all at once, should be faster than doing it one by one*/ + io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); + while (!list_empty_careful(&ctx->inflight_list)) { struct io_kiocb *cancel_req = NULL, *req; DEFINE_WAIT(wait); -- cgit v1.2.3 From 4dd2824d6d5914949b5fe589538bc2622d84c5dd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:33:13 +0300 Subject: io_uring: lazy get task There will be multiple places where req->task is used, so refcount-pin it lazily with introduced *io_{get,put}_req_task(). We need to always have valid ->task for cancellation reasons, but don't care about pinning it in some cases. That's why it sets req->task in io_req_init() and implements get/put laziness with a flag. This also removes using @current from polling io_arm_poll_handler(), etc., but doesn't change observable behaviour. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b6bcd5a7f4bc..5f946eb8b740 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -541,6 +541,7 @@ enum { REQ_F_NO_FILE_TABLE_BIT, REQ_F_QUEUE_TIMEOUT_BIT, REQ_F_WORK_INITIALIZED_BIT, + REQ_F_TASK_PINNED_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -598,6 +599,8 @@ enum { REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), + /* req->task is refcounted */ + REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT), }; struct async_poll { @@ -910,6 +913,21 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +static void io_get_req_task(struct io_kiocb *req) +{ + if (req->flags & REQ_F_TASK_PINNED) + return; + get_task_struct(req->task); + req->flags |= REQ_F_TASK_PINNED; +} + +/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ +static void __io_put_req_task(struct io_kiocb *req) +{ + if (req->flags & REQ_F_TASK_PINNED) + put_task_struct(req->task); +} + static void io_file_put_work(struct work_struct *work); /* @@ -1399,9 +1417,7 @@ static void __io_req_aux_free(struct io_kiocb *req) kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - if (req->task) - put_task_struct(req->task); - + __io_put_req_task(req); io_req_work_drop_env(req); } @@ -4367,8 +4383,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) memcpy(&apoll->work, &req->work, sizeof(req->work)); had_io = req->io != NULL; - get_task_struct(current); - req->task = current; + io_get_req_task(req); req->apoll = apoll; INIT_HLIST_NODE(&req->hash_node); @@ -4556,8 +4571,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - get_task_struct(current); - req->task = current; + io_get_req_task(req); return 0; } @@ -5818,7 +5832,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->flags = 0; /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); - req->task = NULL; + req->task = current; req->result = 0; if (unlikely(req->opcode >= IORING_OP_LAST)) -- cgit v1.2.3 From 801dd57bd1d8c2c253f43635a3045bfa32a810b3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Jun 2020 10:33:14 +0300 Subject: io_uring: cancel by ->task not pid For an exiting process it tries to cancel all its inflight requests. Use req->task to match such instead of work.pid. We always have req->task set, and it will be valid because we're matching only current exiting task. Also, remove work.pid and everything related, it's useless now. Reported-by: Eric W. Biederman Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.h | 1 - fs/io_uring.c | 16 ++++++---------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/io-wq.h b/fs/io-wq.h index b72538fe5afd..071f1a997800 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -90,7 +90,6 @@ struct io_wq_work { const struct cred *creds; struct fs_struct *fs; unsigned flags; - pid_t task_pid; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5f946eb8b740..e17df662c191 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1063,8 +1063,6 @@ static inline void io_req_work_grab_env(struct io_kiocb *req, } spin_unlock(¤t->fs->lock); } - if (!req->work.task_pid) - req->work.task_pid = task_pid_vnr(current); } static inline void io_req_work_drop_env(struct io_kiocb *req) @@ -7451,11 +7449,12 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } } -static bool io_cancel_pid_cb(struct io_wq_work *work, void *data) +static bool io_cancel_task_cb(struct io_wq_work *work, void *data) { - pid_t pid = (pid_t) (unsigned long) data; + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct task_struct *task = data; - return work->task_pid == pid; + return req->task == task; } static int io_uring_flush(struct file *file, void *data) @@ -7467,11 +7466,8 @@ static int io_uring_flush(struct file *file, void *data) /* * If the task is going away, cancel work it may have pending */ - if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { - void *data = (void *) (unsigned long)task_pid_vnr(current); - - io_wq_cancel_cb(ctx->io_wq, io_cancel_pid_cb, data, true); - } + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) + io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true); return 0; } -- cgit v1.2.3 From 2d7d67920e5c8e0854df23ca77da2dd5880ce5dd Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Tue, 16 Jun 2020 02:06:37 +0800 Subject: io_uring: don't fail links for EAGAIN error in IOPOLL mode In IOPOLL mode, for EAGAIN error, we'll try to submit io request again using io-wq, so don't fail rest of links if this io request has links. Cc: stable@vger.kernel.org Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e17df662c191..eb3797714539 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1988,7 +1988,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if (res != req->result) + if (res != -EAGAIN && res != req->result) req_set_fail_links(req); req->result = res; if (res != -EAGAIN) -- cgit v1.2.3 From bbde017a32b32d2fa8d5fddca25fade20132abf8 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Tue, 16 Jun 2020 02:06:38 +0800 Subject: io_uring: add memory barrier to synchronize io_kiocb's result and iopoll_completed In io_complete_rw_iopoll(), stores to io_kiocb's result and iopoll completed are two independent store operations, to ensure that once iopoll_completed is ture and then req->result must been perceived by the cpu executing io_do_iopoll(), proper memory barrier should be used. And in io_do_iopoll(), we check whether req->result is EAGAIN, if it is, we'll need to issue this io request using io-wq again. In order to just issue a single smp_rmb() on the completion side, move the re-submit work to io_iopoll_complete(). Cc: stable@vger.kernel.org Signed-off-by: Xiaoguang Wang [axboe: don't set ->iopoll_completed for -EAGAIN retry] Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index eb3797714539..9d2ae9aa8b45 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1742,6 +1742,18 @@ static int io_put_kbuf(struct io_kiocb *req) return cflags; } +static void io_iopoll_queue(struct list_head *again) +{ + struct io_kiocb *req; + + do { + req = list_first_entry(again, struct io_kiocb, list); + list_del(&req->list); + refcount_inc(&req->refs); + io_queue_async_work(req); + } while (!list_empty(again)); +} + /* * Find and free completed poll iocbs */ @@ -1750,12 +1762,21 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, { struct req_batch rb; struct io_kiocb *req; + LIST_HEAD(again); + + /* order with ->result store in io_complete_rw_iopoll() */ + smp_rmb(); rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { int cflags = 0; req = list_first_entry(done, struct io_kiocb, list); + if (READ_ONCE(req->result) == -EAGAIN) { + req->iopoll_completed = 0; + list_move_tail(&req->list, &again); + continue; + } list_del(&req->list); if (req->flags & REQ_F_BUFFER_SELECTED) @@ -1773,18 +1794,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, if (ctx->flags & IORING_SETUP_SQPOLL) io_cqring_ev_posted(ctx); io_free_req_many(ctx, &rb); -} - -static void io_iopoll_queue(struct list_head *again) -{ - struct io_kiocb *req; - do { - req = list_first_entry(again, struct io_kiocb, list); - list_del(&req->list); - refcount_inc(&req->refs); - io_queue_async_work(req); - } while (!list_empty(again)); + if (!list_empty(&again)) + io_iopoll_queue(&again); } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, @@ -1792,7 +1804,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, { struct io_kiocb *req, *tmp; LIST_HEAD(done); - LIST_HEAD(again); bool spin; int ret; @@ -1818,13 +1829,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) break; - if (req->result == -EAGAIN) { - list_move_tail(&req->list, &again); - continue; - } - if (!list_empty(&again)) - break; - ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); if (ret < 0) break; @@ -1837,9 +1841,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) io_iopoll_complete(ctx, nr_events, &done); - if (!list_empty(&again)) - io_iopoll_queue(&again); - return ret; } @@ -1990,9 +1991,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (res != -EAGAIN && res != req->result) req_set_fail_links(req); - req->result = res; - if (res != -EAGAIN) + + WRITE_ONCE(req->result, res); + /* order with io_poll_complete() checking ->result */ + if (res != -EAGAIN) { + smp_wmb(); WRITE_ONCE(req->iopoll_completed, 1); + } } /* -- cgit v1.2.3 From 9d8426a09195e2dcf2aa249de2aaadd792d491c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jun 2020 18:42:49 -0600 Subject: io_uring: acquire 'mm' for task_work for SQPOLL If we're unlucky with timing, we could be running task_work after having dropped the memory context in the sq thread. Since dropping the context requires a runnable task state, we cannot reliably drop it as part of our check-for-work loop in io_sq_thread(). Instead, abstract out the mm acquire for the sq thread into a helper, and call it from the async task work handler. Cc: stable@vger.kernel.org # v5.7 Signed-off-by: Jens Axboe --- fs/io_uring.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9d2ae9aa8b45..98c83fbf4f88 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4256,6 +4256,28 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&pt->req->apoll->poll, pt, head); } +static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) +{ + struct mm_struct *mm = current->mm; + + if (mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (io_op_defs[req->opcode].needs_mm && !current->mm) { + if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + return -EFAULT; + kthread_use_mm(ctx->sqo_mm); + } + + return 0; +} + static void io_async_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); @@ -4290,11 +4312,16 @@ static void io_async_task_func(struct callback_head *cb) if (!canceled) { __set_current_state(TASK_RUNNING); + if (io_sq_thread_acquire_mm(ctx, req)) { + io_cqring_add_event(req, -EFAULT); + goto end_req; + } mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); } else { io_cqring_ev_posted(ctx); +end_req: req_set_fail_links(req); io_double_put_req(req); } @@ -5841,11 +5868,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; - if (io_op_defs[req->opcode].needs_mm && !current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_mm); - } + if (unlikely(io_sq_thread_acquire_mm(ctx, req))) + return -EFAULT; sqe_flags = READ_ONCE(sqe->flags); /* enforce forwards compatibility on users */ @@ -5954,16 +5978,6 @@ fail_req: return submitted; } -static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - } -} - static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; -- cgit v1.2.3 From 56952e91acc93ed624fe9da840900defb75f1323 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 17 Jun 2020 15:00:04 -0600 Subject: io_uring: reap poll completions while waiting for refs to drop on exit If we're doing polled IO and end up having requests being submitted async, then completions can come in while we're waiting for refs to drop. We need to reap these manually, as nobody else will be looking for them. Break the wait into 1/20th of a second time waits, and check for done poll completions if we time out. Otherwise we can have done poll completions sitting in ctx->poll_list, which needs us to reap them but we're just waiting for them. Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 98c83fbf4f88..2038d52c5450 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7363,7 +7363,17 @@ static void io_ring_exit_work(struct work_struct *work) if (ctx->rings) io_cqring_overflow_flush(ctx, true); - wait_for_completion(&ctx->ref_comp); + /* + * If we're doing polled IO and end up having requests being + * submitted async (out-of-line), then completions can come in while + * we're waiting for refs to drop. We need to reap these manually, + * as nobody else will be looking for them. + */ + while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) { + io_iopoll_reap_events(ctx); + if (ctx->rings) + io_cqring_overflow_flush(ctx, true); + } io_ring_ctx_free(ctx); } -- cgit v1.2.3 From 6f2cc1664db20676069cff27a461ccc97dbfd114 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 18 Jun 2020 15:01:56 +0800 Subject: io_uring: fix possible race condition against REQ_F_NEED_CLEANUP In io_read() or io_write(), when io request is submitted successfully, it'll go through the below sequence: kfree(iovec); req->flags &= ~REQ_F_NEED_CLEANUP; return ret; But clearing REQ_F_NEED_CLEANUP might be unsafe. The io request may already have been completed, and then io_complete_rw_iopoll() and io_complete_rw() will be called, both of which will also modify req->flags if needed. This causes a race condition, with concurrent non-atomic modification of req->flags. To eliminate this race, in io_read() or io_write(), if io request is submitted successfully, we don't remove REQ_F_NEED_CLEANUP flag. If REQ_F_NEED_CLEANUP is set, we'll leave __io_req_aux_free() to the iovec cleanup work correspondingly. Cc: stable@vger.kernel.org Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2038d52c5450..a78201b96179 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2670,8 +2670,8 @@ copy_iov: } } out_free: - kfree(iovec); - req->flags &= ~REQ_F_NEED_CLEANUP; + if (!(req->flags & REQ_F_NEED_CLEANUP)) + kfree(iovec); return ret; } @@ -2793,8 +2793,8 @@ copy_iov: } } out_free: - req->flags &= ~REQ_F_NEED_CLEANUP; - kfree(iovec); + if (!(req->flags & REQ_F_NEED_CLEANUP)) + kfree(iovec); return ret; } -- cgit v1.2.3