From 931147ddfa6e9ffd814272f1c0370c4740acbe17 Mon Sep 17 00:00:00 2001
From: Dylan Yudaken
Date: Thu, 24 Nov 2022 01:35:54 -0800
Subject: io_uring: allow defer completion for aux posted cqes

Multishot ops cannot use the compl_reqs list as the request must stay in
the poll list, but that means they need to run each completion without
benefiting from batching.

Introduce batching infrastructure here for small (i.e. 16-byte) CQEs
only. This restriction is fine because there are no use cases posting
32-byte CQEs.

Keep a batch of up to 16 posted results in the ring, and flush it in the
same way as compl_reqs.

16 was chosen through experimentation on a microbenchmark ([1]), as well
as by trying not to increase the size of the ring too much. This change
increases the size to 1472 bytes from 1216.

[1]: https://github.com/DylanZA/liburing/commit/9ac66b36bcf4477bfafeff1c5f107896b7ae31cf
Run with $ make -j && ./benchmark/reg.b -s 1 -t 2000 -r 10
Gives results:
baseline  8309 k/s
8        18807 k/s
16       19338 k/s
32       20134 k/s

Suggested-by: Pavel Begunkov
Signed-off-by: Dylan Yudaken
Link: https://lore.kernel.org/r/20221124093559.3780686-5-dylany@meta.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/io_uring_types.h')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f5b687a787a3..accdfecee953 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -174,7 +174,9 @@ struct io_submit_state {
 	bool			plug_started;
 	bool			need_plug;
 	unsigned short		submit_nr;
+	unsigned int		cqes_count;
 	struct blk_plug		plug;
+	struct io_uring_cqe	cqes[16];
 };
 
 struct io_ev_fd {
--
cgit v1.2.3
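The hunk above adds only the storage for the batch; the flush path lives
in io_uring/io_uring.c, which this header-filtered view omits.
Conceptually, the flush walks the cached entries and posts each one as an
aux CQE, along these lines (a sketch only: the helper name and the
io_fill_cqe_aux() signature are assumptions, not taken from the hunk
above):

/*
 * Sketch of the flush side (not part of the header-only diff above).
 * Assumes a helper io_fill_cqe_aux() that posts a single auxiliary CQE;
 * called with ->uring_lock held, like the compl_reqs flush it mirrors.
 */
static void io_flush_post_cqes(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_state *state = &ctx->submit_state;
	unsigned int i;

	for (i = 0; i < state->cqes_count; i++) {
		struct io_uring_cqe *cqe = &state->cqes[i];

		io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags);
	}
	state->cqes_count = 0;
}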
From e6aeb2721d3bad8379c43644d0380908e93b0187 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 7 Dec 2022 03:53:30 +0000
Subject: io_uring: complete all requests in task context

This patch adds the ctx->task_complete flag. If set, we'll complete all
requests in the context of the original task. Note, this extends only to
completion CQE posting, not to io_kiocb cleanup / free; e.g. io-wq may
free the requests in the free callback. This flag will be used later for
optimisation purposes.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/21ece72953f76bb2e77659a72a14326227ab6460.1670384893.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring.h       |  2 ++
 include/linux/io_uring_types.h |  2 ++
 io_uring/io_uring.c            | 14 +++++++++++---
 3 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/linux/io_uring_types.h')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 29e519752da4..934e5dd4ccc0 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -11,6 +11,8 @@ enum io_uring_cmd_flags {
 	IO_URING_F_UNLOCKED		= 2,
 	/* the request is executed from poll, it should not be freed */
 	IO_URING_F_MULTISHOT		= 4,
+	/* executed by io-wq */
+	IO_URING_F_IOWQ			= 8,
 	/* int's last bit, sign checks are usually faster than a bit test */
 	IO_URING_F_NONBLOCK		= INT_MIN,
 
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index accdfecee953..6be1e1359c89 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -208,6 +208,8 @@ struct io_ring_ctx {
 		unsigned int		drain_disabled: 1;
 		unsigned int		has_evfd: 1;
 		unsigned int		syscall_iopoll: 1;
+		/* all CQEs should be posted only by the submitter task */
+		unsigned int		task_complete: 1;
 	} ____cacheline_aligned_in_smp;
 
 	/* submission data */
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ba5ef1e675c1..13d56472dd49 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -921,8 +921,11 @@ static void __io_req_complete_post(struct io_kiocb *req)
 
 void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
 {
-	if (!(issue_flags & IO_URING_F_UNLOCKED) ||
-	    !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
+	if (req->ctx->task_complete && (issue_flags & IO_URING_F_IOWQ)) {
+		req->io_task_work.func = io_req_task_complete;
+		io_req_task_work_add(req);
+	} else if (!(issue_flags & IO_URING_F_UNLOCKED) ||
+		   !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
 		__io_req_complete_post(req);
 	} else {
 		struct io_ring_ctx *ctx = req->ctx;
@@ -1830,7 +1833,7 @@ void io_wq_submit_work(struct io_wq_work *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	const struct io_op_def *def = &io_op_defs[req->opcode];
-	unsigned int issue_flags = IO_URING_F_UNLOCKED;
+	unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ;
 	bool needs_poll = false;
 	int ret = 0, err = -ECANCELED;
 
@@ -3490,6 +3493,11 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	if (!ctx)
 		return -ENOMEM;
 
+	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
+	    !(ctx->flags & IORING_SETUP_IOPOLL) &&
+	    !(ctx->flags & IORING_SETUP_SQPOLL))
+		ctx->task_complete = true;
+
 	/*
 	 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
 	 * space applications don't need to do io completion events
--
cgit v1.2.3
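Per the io_uring_create() hunk above, ctx->task_complete is only set for
rings created with IORING_SETUP_DEFER_TASKRUN and without IOPOLL or
SQPOLL. From userspace that corresponds to a setup like the following
liburing sketch (DEFER_TASKRUN also requires IORING_SETUP_SINGLE_ISSUER;
this assumes a liburing recent enough to define both flags):

#include <liburing.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_params p = { 0 };
	int ret;

	/*
	 * Defer task_work to io_uring_enter() in the submitter task; the
	 * kernel requires SINGLE_ISSUER alongside DEFER_TASKRUN. With
	 * neither IOPOLL nor SQPOLL set, the hunk above makes this ring
	 * run with ctx->task_complete enabled.
	 */
	p.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;

	ret = io_uring_queue_init_params(8, &ring, &p);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}
	io_uring_queue_exit(&ring);
	return 0;
}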
From d34b1b0b6779d4f5ee877b53cad90eef0f1cbe34 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 7 Dec 2022 03:53:32 +0000
Subject: io_uring: use tw for putting rsrc

Use task_work for completing rsrc removals; it'll be needed later for
spinlock optimisations.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/cbba5d53a11ee6fc2194dacea262c1d733c8b529.1670384893.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  1 +
 io_uring/io_uring.c            |  1 +
 io_uring/rsrc.c                | 19 +++++++++++++++++--
 io_uring/rsrc.h                |  1 +
 4 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/linux/io_uring_types.h')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 6be1e1359c89..dcd8a563ab52 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -328,6 +328,7 @@ struct io_ring_ctx {
 	struct io_rsrc_data		*buf_data;
 
 	struct delayed_work		rsrc_put_work;
+	struct callback_head		rsrc_put_tw;
 	struct llist_head		rsrc_put_llist;
 	struct list_head		rsrc_ref_list;
 	spinlock_t			rsrc_ref_lock;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 13d56472dd49..a29cbf973287 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -326,6 +326,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	spin_lock_init(&ctx->rsrc_ref_lock);
 	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
+	init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw);
 	init_llist_head(&ctx->rsrc_put_llist);
 	init_llist_head(&ctx->work_llist);
 	INIT_LIST_HEAD(&ctx->tctx_list);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index d25309400a45..18de10c68a15 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -204,6 +204,14 @@ void io_rsrc_put_work(struct work_struct *work)
 	}
 }
 
+void io_rsrc_put_tw(struct callback_head *cb)
+{
+	struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
+					       rsrc_put_tw);
+
+	io_rsrc_put_work(&ctx->rsrc_put_work.work);
+}
+
 void io_wait_rsrc_data(struct io_rsrc_data *data)
 {
 	if (data && !atomic_dec_and_test(&data->refs))
@@ -242,8 +250,15 @@ static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
 	}
 	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
 
-	if (first_add)
-		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
+	if (!first_add)
+		return;
+
+	if (ctx->submitter_task) {
+		if (!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw,
+				   ctx->notify_method))
+			return;
+	}
+	mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
 }
 
 static struct io_rsrc_node *io_rsrc_node_alloc(void)
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 81445a477622..2b8743645efc 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -53,6 +53,7 @@ struct io_mapped_ubuf {
 	struct bio_vec	bvec[];
 };
 
+void io_rsrc_put_tw(struct callback_head *cb);
 void io_rsrc_put_work(struct work_struct *work);
 void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
 void io_wait_rsrc_data(struct io_rsrc_data *data);
--
cgit v1.2.3
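The io_rsrc_node_ref_zero() hunk above captures a common kernel pattern:
prefer running the put on the submitter task via task_work, and fall back
to the system workqueue if that fails (task_work_add() returns -ESRCH
once the target task is exiting, so the callback would otherwise be
lost). Distilled into a generic form, with names that are illustrative
rather than taken from the patch:

#include <linux/sched.h>
#include <linux/task_work.h>
#include <linux/workqueue.h>

/*
 * Illustrative helper, not part of the patch: queue @tw on @task if
 * possible, otherwise schedule @fallback on the system workqueue so
 * the work is never dropped. Mirrors io_rsrc_node_ref_zero() above.
 */
static void run_on_task_or_wq(struct task_struct *task,
			      struct callback_head *tw,
			      struct delayed_work *fallback)
{
	if (task && !task_work_add(task, tw, TWA_SIGNAL))
		return;
	mod_delayed_work(system_wq, fallback, 0);
}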