From d55e5f5b70dd6214ef81fb2313121b72a7dd2200 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 11 Dec 2019 16:12:15 -0700 Subject: io_uring: use u64_to_user_ptr() consistently We use it in some spots, but not consistently. Convert the rest over, makes it easier to read as well. No functional changes in this patch. Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6f084e3cf835..7a23d2351be2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2157,7 +2157,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) unsigned flags; flags = READ_ONCE(sqe->msg_flags); - msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); io->msg.iov = io->msg.fast_iov; return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); #else @@ -2239,7 +2239,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) unsigned flags; flags = READ_ONCE(sqe->msg_flags); - msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); io->msg.iov = io->msg.fast_iov; return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, &io->msg.iov); @@ -2273,8 +2273,7 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, else if (force_nonblock) flags |= MSG_DONTWAIT; - msg = (struct user_msghdr __user *) (unsigned long) - READ_ONCE(sqe->addr); + msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); if (req->io) { kmsg = &req->io->msg; kmsg->msg.msg_name = &addr; @@ -2331,9 +2330,8 @@ static int io_accept_prep(struct io_kiocb *req) if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; - accept->addr = (struct sockaddr __user *) - (unsigned long) READ_ONCE(sqe->addr); - accept->addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); + accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); req->flags |= REQ_F_PREPPED; return 0; @@ -2407,7 +2405,7 @@ static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) struct sockaddr __user *addr; int addr_len; - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); + addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); addr_len = READ_ONCE(sqe->addr2); return move_addr_to_kernel(addr, addr_len, &io->connect.address); #else @@ -4702,7 +4700,7 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) return -EFAULT; - dst->iov_base = (void __user *) (unsigned long) ciov.iov_base; + dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); dst->iov_len = ciov.iov_len; return 0; } -- cgit v1.2.3 From 9adbd45d6d32ffc1a03f3c51d72cfc69ebfc2ddb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2019 08:45:55 -0700 Subject: io_uring: add and use struct io_rw for read/writes Put the kiocb in struct io_rw, and add the addr/len for the request as well. Use the kiocb->private field for the buffer index for fixed reads and writes. Any use of kiocb->ki_filp is flipped to req->file. It's the same thing, and less confusing. Signed-off-by: Jens Axboe --- fs/io_uring.c | 96 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 50 insertions(+), 46 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7a23d2351be2..b5f91d21fd04 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -332,6 +332,13 @@ struct io_timeout { int flags; }; +struct io_rw { + /* NOTE: kiocb has the file as the first member, so don't do it here */ + struct kiocb kiocb; + u64 addr; + u64 len; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -369,7 +376,7 @@ struct io_async_ctx { struct io_kiocb { union { struct file *file; - struct kiocb rw; + struct io_rw rw; struct io_poll_iocb poll; struct io_accept accept; struct io_sync sync; @@ -1180,7 +1187,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, ret = 0; list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; /* * Move completed entries to our local list. If we find a @@ -1335,7 +1342,7 @@ static inline void req_set_fail_links(struct io_kiocb *req) static void io_complete_rw_common(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); @@ -1347,7 +1354,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); io_complete_rw_common(kiocb, res); io_put_req(req); @@ -1355,7 +1362,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); struct io_kiocb *nxt = NULL; io_complete_rw_common(kiocb, res); @@ -1366,7 +1373,7 @@ static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); @@ -1400,7 +1407,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req) list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, list); - if (list_req->rw.ki_filp != req->rw.ki_filp) + if (list_req->file != req->file) ctx->poll_multi_file = true; } @@ -1475,7 +1482,7 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) { const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; unsigned ioprio; int ret; @@ -1524,6 +1531,12 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) return -EINVAL; kiocb->ki_complete = io_complete_rw; } + + req->rw.addr = READ_ONCE(req->sqe->addr); + req->rw.len = READ_ONCE(req->sqe->len); + /* we own ->private, reuse it for the buffer index */ + req->rw.kiocb.private = (void *) (unsigned long) + READ_ONCE(req->sqe->buf_index); return 0; } @@ -1557,11 +1570,11 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, io_rw_done(kiocb, ret); } -static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, - const struct io_uring_sqe *sqe, +static ssize_t io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) { - size_t len = READ_ONCE(sqe->len); + struct io_ring_ctx *ctx = req->ctx; + size_t len = req->rw.len; struct io_mapped_ubuf *imu; unsigned index, buf_index; size_t offset; @@ -1571,13 +1584,13 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, if (unlikely(!ctx->user_bufs)) return -EFAULT; - buf_index = READ_ONCE(sqe->buf_index); + buf_index = (unsigned long) req->rw.kiocb.private; if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; index = array_index_nospec(buf_index, ctx->nr_user_bufs); imu = &ctx->user_bufs[index]; - buf_addr = READ_ONCE(sqe->addr); + buf_addr = req->rw.addr; /* overflow */ if (buf_addr + len < buf_addr) @@ -1634,25 +1647,20 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter) { - const struct io_uring_sqe *sqe = req->sqe; - void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - size_t sqe_len = READ_ONCE(sqe->len); + void __user *buf = u64_to_user_ptr(req->rw.addr); + size_t sqe_len = req->rw.len; u8 opcode; - /* - * We're reading ->opcode for the second time, but the first read - * doesn't care whether it's _FIXED or not, so it doesn't matter - * whether ->opcode changes concurrently. The first read does care - * about whether it is a READ or a WRITE, so we don't trust this read - * for that purpose and instead let the caller pass in the read/write - * flag. - */ opcode = req->opcode; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; - return io_import_fixed(req->ctx, rw, sqe, iter); + return io_import_fixed(req, rw, iter); } + /* buffer index only valid with fixed read/write */ + if (req->rw.kiocb.private) + return -EINVAL; + if (req->io) { struct io_async_rw *iorw = &req->io->rw; @@ -1801,9 +1809,8 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; - struct file *file; size_t iov_count; ssize_t io_size, ret; @@ -1819,9 +1826,8 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, /* Ensure we clear previously set non-block flag */ if (!force_nonblock) - req->rw.ki_flags &= ~IOCB_NOWAIT; + req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; - file = req->file; io_size = ret; if (req->flags & REQ_F_LINK) req->result = io_size; @@ -1830,20 +1836,20 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(file)) { + if (force_nonblock && !io_file_supports_async(req->file)) { req->flags |= REQ_F_MUST_PUNT; goto copy_iov; } iov_count = iov_iter_count(&iter); - ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; - if (file->f_op->read_iter) - ret2 = call_read_iter(file, kiocb, &iter); + if (req->file->f_op->read_iter) + ret2 = call_read_iter(req->file, kiocb, &iter); else - ret2 = loop_rw_iter(READ, file, kiocb, &iter); + ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); /* * In case of a short read, punt to async. This can happen @@ -1894,9 +1900,8 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; - struct file *file; size_t iov_count; ssize_t ret, io_size; @@ -1912,9 +1917,8 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, /* Ensure we clear previously set non-block flag */ if (!force_nonblock) - req->rw.ki_flags &= ~IOCB_NOWAIT; + req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; - file = kiocb->ki_filp; io_size = ret; if (req->flags & REQ_F_LINK) req->result = io_size; @@ -1934,7 +1938,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, goto copy_iov; iov_count = iov_iter_count(&iter); - ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; @@ -1946,17 +1950,17 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, * we return to userspace. */ if (req->flags & REQ_F_ISREG) { - __sb_start_write(file_inode(file)->i_sb, + __sb_start_write(file_inode(req->file)->i_sb, SB_FREEZE_WRITE, true); - __sb_writers_release(file_inode(file)->i_sb, + __sb_writers_release(file_inode(req->file)->i_sb, SB_FREEZE_WRITE); } kiocb->ki_flags |= IOCB_WRITE; - if (file->f_op->write_iter) - ret2 = call_write_iter(file, kiocb, &iter); + if (req->file->f_op->write_iter) + ret2 = call_write_iter(req->file, kiocb, &iter); else - ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); + ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); } else { @@ -2036,7 +2040,7 @@ static void io_fsync_finish(struct io_wq_work **workptr) if (io_req_cancelled(req)) return; - ret = vfs_fsync_range(req->rw.ki_filp, req->sync.off, + ret = vfs_fsync_range(req->file, req->sync.off, end > 0 ? end : LLONG_MAX, req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) @@ -2102,7 +2106,7 @@ static void io_sync_file_range_finish(struct io_wq_work **workptr) if (io_req_cancelled(req)) return; - ret = sync_file_range(req->rw.ki_filp, req->sync.off, req->sync.len, + ret = sync_file_range(req->file, req->sync.off, req->sync.len, req->sync.flags); if (ret < 0) req_set_fail_links(req); -- cgit v1.2.3 From 3fbb51c18f5c15a23db74c4da79d3d035176c480 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2019 08:51:52 -0700 Subject: io_uring: move all prep state for IORING_OP_CONNECT to prep handler Add struct io_connect in our io_kiocb per-command union, and ensure that io_connect_prep() has grabbed what it needs from the SQE. Signed-off-by: Jens Axboe --- fs/io_uring.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b5f91d21fd04..2a173f54ec8e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -339,6 +339,12 @@ struct io_rw { u64 len; }; +struct io_connect { + struct file *file; + struct sockaddr __user *addr; + int addr_len; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -382,6 +388,7 @@ struct io_kiocb { struct io_sync sync; struct io_cancel cancel; struct io_timeout timeout; + struct io_connect connect; }; const struct io_uring_sqe *sqe; @@ -2406,14 +2413,18 @@ static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) const struct io_uring_sqe *sqe = req->sqe; - struct sockaddr __user *addr; - int addr_len; - addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - addr_len = READ_ONCE(sqe->addr2); - return move_addr_to_kernel(addr, addr_len, &io->connect.address); + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) + return -EINVAL; + + req->connect.addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->connect.addr_len = READ_ONCE(sqe->addr2); + return move_addr_to_kernel(req->connect.addr, req->connect.addr_len, + &io->connect.address); #else - return 0; + return -EOPNOTSUPP; #endif } @@ -2421,18 +2432,9 @@ static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; struct io_async_ctx __io, *io; unsigned file_flags; - int addr_len, ret; - - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) - return -EINVAL; - - addr_len = READ_ONCE(sqe->addr2); - file_flags = force_nonblock ? O_NONBLOCK : 0; + int ret; if (req->io) { io = req->io; @@ -2443,8 +2445,10 @@ static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, io = &__io; } - ret = __sys_connect_file(req->file, &io->connect.address, addr_len, - file_flags); + file_flags = force_nonblock ? O_NONBLOCK : 0; + + ret = __sys_connect_file(req->file, &io->connect.address, + req->connect.addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { if (req->io) return -EAGAIN; -- cgit v1.2.3 From e47293fdf98998292a89d516c8f7b8b9eb5c5213 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2019 08:58:21 -0700 Subject: io_uring: move all prep state for IORING_OP_{SEND,RECV}_MGS to prep handler Add struct io_sr_msg in our io_kiocb per-command union, and ensure that the send/recvmsg prep handlers have grabbed what they need from the SQE by the time prep is done. Signed-off-by: Jens Axboe --- fs/io_uring.c | 64 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2a173f54ec8e..89e5b19044cc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -345,6 +345,12 @@ struct io_connect { int addr_len; }; +struct io_sr_msg { + struct file *file; + struct user_msghdr __user *msg; + int msg_flags; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -389,6 +395,7 @@ struct io_kiocb { struct io_cancel cancel; struct io_timeout timeout; struct io_connect connect; + struct io_sr_msg sr_msg; }; const struct io_uring_sqe *sqe; @@ -2164,15 +2171,15 @@ static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) const struct io_uring_sqe *sqe = req->sqe; - struct user_msghdr __user *msg; - unsigned flags; + struct io_sr_msg *sr = &req->sr_msg; - flags = READ_ONCE(sqe->msg_flags); - msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->msg_flags = READ_ONCE(sqe->msg_flags); + sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); io->msg.iov = io->msg.fast_iov; - return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); + return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + &io->msg.iov); #else - return 0; + return -EOPNOTSUPP; #endif } @@ -2180,7 +2187,6 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -2194,12 +2200,6 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, struct sockaddr_storage addr; unsigned flags; - flags = READ_ONCE(sqe->msg_flags); - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - if (req->io) { kmsg = &req->io->msg; kmsg->msg.msg_name = &addr; @@ -2215,6 +2215,12 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, goto out; } + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (force_nonblock && ret == -EAGAIN) { if (req->io) @@ -2245,17 +2251,15 @@ out: static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; - struct user_msghdr __user *msg; - unsigned flags; + struct io_sr_msg *sr = &req->sr_msg; - flags = READ_ONCE(sqe->msg_flags); - msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->msg_flags = READ_ONCE(req->sqe->msg_flags); + sr->msg = u64_to_user_ptr(READ_ONCE(req->sqe->addr)); io->msg.iov = io->msg.fast_iov; - return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, - &io->msg.iov); + return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + &io->msg.uaddr, &io->msg.iov); #else - return 0; + return -EOPNOTSUPP; #endif } @@ -2263,7 +2267,6 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -2273,18 +2276,10 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, sock = sock_from_file(req->file, &ret); if (sock) { - struct user_msghdr __user *msg; struct io_async_ctx io; struct sockaddr_storage addr; unsigned flags; - flags = READ_ONCE(sqe->msg_flags); - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); if (req->io) { kmsg = &req->io->msg; kmsg->msg.msg_name = &addr; @@ -2300,7 +2295,14 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, goto out; } - ret = __sys_recvmsg_sock(sock, &kmsg->msg, msg, kmsg->uaddr, flags); + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, + kmsg->uaddr, flags); if (force_nonblock && ret == -EAGAIN) { if (req->io) return -EAGAIN; -- cgit v1.2.3 From 26a61679f10c6f041726411964b172565021c2eb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2019 09:02:01 -0700 Subject: io_uring: read 'count' for IORING_OP_TIMEOUT in prep handler Add the count field to struct io_timeout, and ensure the prep handler has read it. Timeout also needs an async context always, set it up in the prep handler if we don't have one. Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 89e5b19044cc..7e8d28750053 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -330,6 +330,7 @@ struct io_timeout { struct file *file; u64 addr; int flags; + unsigned count; }; struct io_rw { @@ -2902,7 +2903,12 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - data = &io->timeout; + req->timeout.count = READ_ONCE(sqe->off); + + if (!io && io_alloc_async_ctx(req)) + return -ENOMEM; + + data = &req->io->timeout; data->req = req; req->flags |= REQ_F_TIMEOUT; @@ -2920,7 +2926,6 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, static int io_timeout(struct io_kiocb *req) { - const struct io_uring_sqe *sqe = req->sqe; unsigned count; struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data; @@ -2942,7 +2947,7 @@ static int io_timeout(struct io_kiocb *req) * timeout event to be satisfied. If it isn't set, then this is * a pure timeout request, sequence isn't used. */ - count = READ_ONCE(sqe->off); + count = req->timeout.count; if (!count) { req->flags |= REQ_F_TIMEOUT_NOSEQ; spin_lock_irq(&ctx->completion_lock); -- cgit v1.2.3 From 06b76d44ba25e52711dc7cc4fc75b50907bc6b8e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Dec 2019 14:44:26 -0700 Subject: io_uring: standardize the prep methods We currently have a mix of use cases. Most of the newer ones are pretty uniform, but we have some older ones that use different calling calling conventions. This is confusing. For the opcodes that currently rely on the req->io->sqe copy saving them from reuse, add a request type struct in the io_kiocb command union to store the data they need. Prepare for all opcodes having a standard prep method, so we can call it in a uniform fashion and outside of the opcode handler. This is in preparation for passing in the 'sqe' pointer, rather than storing it in the io_kiocb. Once we have uniform prep handlers, we can leave all the prep work to that part, and not even pass in the sqe to the opcode handler. This ensures that we don't reuse sqe data inadvertently. Signed-off-by: Jens Axboe --- fs/io_uring.c | 128 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 63 insertions(+), 65 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7e8d28750053..2cdfbb451fe2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -371,7 +371,6 @@ struct io_async_rw { }; struct io_async_ctx { - struct io_uring_sqe sqe; union { struct io_async_rw rw; struct io_async_msghdr msg; @@ -433,7 +432,6 @@ struct io_kiocb { #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ #define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ -#define REQ_F_PREPPED 131072 /* request already opcode prepared */ u64 user_data; u32 result; u32 sequence; @@ -1501,6 +1499,8 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) unsigned ioprio; int ret; + if (!sqe) + return 0; if (!req->file) return -EBADF; @@ -1552,6 +1552,7 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) /* we own ->private, reuse it for the buffer index */ req->rw.kiocb.private = (void *) (unsigned long) READ_ONCE(req->sqe->buf_index); + req->sqe = NULL; return 0; } @@ -1773,13 +1774,7 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, static int io_alloc_async_ctx(struct io_kiocb *req) { req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); - if (req->io) { - memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); - req->sqe = &req->io->sqe; - return 0; - } - - return 1; + return req->io == NULL; } static void io_rw_async(struct io_wq_work **workptr) @@ -1810,12 +1805,14 @@ static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, { ssize_t ret; - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; + if (req->sqe) { + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; - if (unlikely(!(req->file->f_mode & FMODE_READ))) - return -EBADF; + if (unlikely(!(req->file->f_mode & FMODE_READ))) + return -EBADF; + } return io_import_iovec(READ, req, iovec, iter); } @@ -1829,15 +1826,9 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t io_size, ret; - if (!req->io) { - ret = io_read_prep(req, &iovec, &iter, force_nonblock); - if (ret < 0) - return ret; - } else { - ret = io_import_iovec(READ, req, &iovec, &iter); - if (ret < 0) - return ret; - } + ret = io_read_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) @@ -1901,12 +1892,14 @@ static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, { ssize_t ret; - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; + if (req->sqe) { + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; - if (unlikely(!(req->file->f_mode & FMODE_WRITE))) - return -EBADF; + if (unlikely(!(req->file->f_mode & FMODE_WRITE))) + return -EBADF; + } return io_import_iovec(WRITE, req, iovec, iter); } @@ -1920,15 +1913,9 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t ret, io_size; - if (!req->io) { - ret = io_write_prep(req, &iovec, &iter, force_nonblock); - if (ret < 0) - return ret; - } else { - ret = io_import_iovec(WRITE, req, &iovec, &iter); - if (ret < 0) - return ret; - } + ret = io_write_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) @@ -2013,7 +2000,7 @@ static int io_prep_fsync(struct io_kiocb *req) const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - if (req->flags & REQ_F_PREPPED) + if (!req->sqe) return 0; if (!req->file) return -EBADF; @@ -2029,7 +2016,7 @@ static int io_prep_fsync(struct io_kiocb *req) req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->len); - req->flags |= REQ_F_PREPPED; + req->sqe = NULL; return 0; } @@ -2095,7 +2082,7 @@ static int io_prep_sfr(struct io_kiocb *req) const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - if (req->flags & REQ_F_PREPPED) + if (!sqe) return 0; if (!req->file) return -EBADF; @@ -2108,7 +2095,7 @@ static int io_prep_sfr(struct io_kiocb *req) req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->len); req->sync.flags = READ_ONCE(sqe->sync_range_flags); - req->flags |= REQ_F_PREPPED; + req->sqe = NULL; return 0; } @@ -2173,12 +2160,17 @@ static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) #if defined(CONFIG_NET) const struct io_uring_sqe *sqe = req->sqe; struct io_sr_msg *sr = &req->sr_msg; + int ret; + if (!sqe) + return 0; sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); io->msg.iov = io->msg.fast_iov; - return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.iov); + req->sqe = NULL; + return ret; #else return -EOPNOTSUPP; #endif @@ -2253,12 +2245,18 @@ static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; + int ret; + + if (!req->sqe) + return 0; sr->msg_flags = READ_ONCE(req->sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(req->sqe->addr)); io->msg.iov = io->msg.fast_iov; - return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.uaddr, &io->msg.iov); + req->sqe = NULL; + return ret; #else return -EOPNOTSUPP; #endif @@ -2336,7 +2334,7 @@ static int io_accept_prep(struct io_kiocb *req) const struct io_uring_sqe *sqe = req->sqe; struct io_accept *accept = &req->accept; - if (req->flags & REQ_F_PREPPED) + if (!req->sqe) return 0; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) @@ -2347,7 +2345,7 @@ static int io_accept_prep(struct io_kiocb *req) accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); - req->flags |= REQ_F_PREPPED; + req->sqe = NULL; return 0; #else return -EOPNOTSUPP; @@ -2416,7 +2414,10 @@ static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) const struct io_uring_sqe *sqe = req->sqe; + int ret; + if (!sqe) + return 0; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) @@ -2424,8 +2425,10 @@ static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) req->connect.addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); req->connect.addr_len = READ_ONCE(sqe->addr2); - return move_addr_to_kernel(req->connect.addr, req->connect.addr_len, + ret = move_addr_to_kernel(req->connect.addr, req->connect.addr_len, &io->connect.address); + req->sqe = NULL; + return ret; #else return -EOPNOTSUPP; #endif @@ -2526,7 +2529,7 @@ static int io_poll_remove_prep(struct io_kiocb *req) { const struct io_uring_sqe *sqe = req->sqe; - if (req->flags & REQ_F_PREPPED) + if (!sqe) return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -2535,7 +2538,7 @@ static int io_poll_remove_prep(struct io_kiocb *req) return -EINVAL; req->poll.addr = READ_ONCE(sqe->addr); - req->flags |= REQ_F_PREPPED; + req->sqe = NULL; return 0; } @@ -2696,7 +2699,7 @@ static int io_poll_add_prep(struct io_kiocb *req) struct io_poll_iocb *poll = &req->poll; u16 events; - if (req->flags & REQ_F_PREPPED) + if (!sqe) return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -2705,9 +2708,9 @@ static int io_poll_add_prep(struct io_kiocb *req) if (!poll->file) return -EBADF; - req->flags |= REQ_F_PREPPED; events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + req->sqe = NULL; return 0; } @@ -2845,7 +2848,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req) { const struct io_uring_sqe *sqe = req->sqe; - if (req->flags & REQ_F_PREPPED) + if (!sqe) return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -2857,7 +2860,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req) if (req->timeout.flags) return -EINVAL; - req->flags |= REQ_F_PREPPED; + req->sqe = NULL; return 0; } @@ -2893,6 +2896,8 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, struct io_timeout_data *data; unsigned flags; + if (!sqe) + return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->buf_index || sqe->len != 1) @@ -2921,6 +2926,7 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, data->mode = HRTIMER_MODE_REL; hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + req->sqe = NULL; return 0; } @@ -2933,13 +2939,9 @@ static int io_timeout(struct io_kiocb *req) unsigned span = 0; int ret; - if (!req->io) { - if (io_alloc_async_ctx(req)) - return -ENOMEM; - ret = io_timeout_prep(req, req->io, false); - if (ret) - return ret; - } + ret = io_timeout_prep(req, req->io, false); + if (ret) + return ret; data = &req->io->timeout; /* @@ -3069,7 +3071,7 @@ static int io_async_cancel_prep(struct io_kiocb *req) { const struct io_uring_sqe *sqe = req->sqe; - if (req->flags & REQ_F_PREPPED) + if (!sqe) return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3077,8 +3079,8 @@ static int io_async_cancel_prep(struct io_kiocb *req) sqe->cancel_flags) return -EINVAL; - req->flags |= REQ_F_PREPPED; req->cancel.addr = READ_ONCE(sqe->addr); + req->sqe = NULL; return 0; } @@ -3213,13 +3215,9 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, ret = io_nop(req); break; case IORING_OP_READV: - if (unlikely(req->sqe->buf_index)) - return -EINVAL; ret = io_read(req, nxt, force_nonblock); break; case IORING_OP_WRITEV: - if (unlikely(req->sqe->buf_index)) - return -EINVAL; ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_READ_FIXED: -- cgit v1.2.3 From 3529d8c2b353e6e446277ae96a36e7471cb070fc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Dec 2019 18:24:38 -0700 Subject: io_uring: pass in 'sqe' to the prep handlers This moves the prep handlers outside of the opcode handlers, and allows us to pass in the sqe directly. If the sqe is non-NULL, it means that the request should be prepared for the first time. With the opcode handlers not having access to the sqe at all, we are guaranteed that the prep handler has setup the request fully by the time we get there. As before, for opcodes that need to copy in more data then the io_kiocb allows for, the io_async_ctx holds that info. If a prep handler is invoked with req->io set, it must use that to retain information for later. Finally, we can remove io_kiocb->sqe as well. Signed-off-by: Jens Axboe --- fs/io_uring.c | 493 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 251 insertions(+), 242 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2cdfbb451fe2..562e3a1a1bf9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -398,7 +398,6 @@ struct io_kiocb { struct io_sr_msg sr_msg; }; - const struct io_uring_sqe *sqe; struct io_async_ctx *io; struct file *ring_file; int ring_fd; @@ -629,33 +628,31 @@ static inline bool io_prep_async_work(struct io_kiocb *req, { bool do_hashed = false; - if (req->sqe) { - switch (req->opcode) { - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - /* only regular files should be hashed for writes */ - if (req->flags & REQ_F_ISREG) - do_hashed = true; - /* fall-through */ - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_SENDMSG: - case IORING_OP_RECVMSG: - case IORING_OP_ACCEPT: - case IORING_OP_POLL_ADD: - case IORING_OP_CONNECT: - /* - * We know REQ_F_ISREG is not set on some of these - * opcodes, but this enables us to keep the check in - * just one place. - */ - if (!(req->flags & REQ_F_ISREG)) - req->work.flags |= IO_WQ_WORK_UNBOUND; - break; - } - if (io_req_needs_user(req)) - req->work.flags |= IO_WQ_WORK_NEEDS_USER; + switch (req->opcode) { + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + /* only regular files should be hashed for writes */ + if (req->flags & REQ_F_ISREG) + do_hashed = true; + /* fall-through */ + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_SENDMSG: + case IORING_OP_RECVMSG: + case IORING_OP_ACCEPT: + case IORING_OP_POLL_ADD: + case IORING_OP_CONNECT: + /* + * We know REQ_F_ISREG is not set on some of these + * opcodes, but this enables us to keep the check in + * just one place. + */ + if (!(req->flags & REQ_F_ISREG)) + req->work.flags |= IO_WQ_WORK_UNBOUND; + break; } + if (io_req_needs_user(req)) + req->work.flags |= IO_WQ_WORK_NEEDS_USER; *link = io_prep_linked_timeout(req); return do_hashed; @@ -1491,16 +1488,14 @@ static bool io_file_supports_async(struct file *file) return false; } -static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) { - const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw.kiocb; unsigned ioprio; int ret; - if (!sqe) - return 0; if (!req->file) return -EBADF; @@ -1547,12 +1542,11 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) kiocb->ki_complete = io_complete_rw; } - req->rw.addr = READ_ONCE(req->sqe->addr); - req->rw.len = READ_ONCE(req->sqe->len); + req->rw.addr = READ_ONCE(sqe->addr); + req->rw.len = READ_ONCE(sqe->len); /* we own ->private, reuse it for the buffer index */ req->rw.kiocb.private = (void *) (unsigned long) - READ_ONCE(req->sqe->buf_index); - req->sqe = NULL; + READ_ONCE(sqe->buf_index); return 0; } @@ -1800,21 +1794,33 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, return 0; } -static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, - struct iov_iter *iter, bool force_nonblock) +static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) { + struct io_async_ctx *io; + struct iov_iter iter; ssize_t ret; - if (req->sqe) { - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; + ret = io_prep_rw(req, sqe, force_nonblock); + if (ret) + return ret; - if (unlikely(!(req->file->f_mode & FMODE_READ))) - return -EBADF; - } + if (unlikely(!(req->file->f_mode & FMODE_READ))) + return -EBADF; - return io_import_iovec(READ, req, iovec, iter); + if (!req->io) + return 0; + + io = req->io; + io->rw.iov = io->rw.fast_iov; + req->io = NULL; + ret = io_import_iovec(READ, req, &io->rw.iov, &iter); + req->io = io; + if (ret < 0) + return ret; + + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + return 0; } static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, @@ -1826,7 +1832,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t io_size, ret; - ret = io_read_prep(req, &iovec, &iter, force_nonblock); + ret = io_import_iovec(READ, req, &iovec, &iter); if (ret < 0) return ret; @@ -1887,21 +1893,33 @@ out_free: return ret; } -static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, - struct iov_iter *iter, bool force_nonblock) +static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) { + struct io_async_ctx *io; + struct iov_iter iter; ssize_t ret; - if (req->sqe) { - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; + ret = io_prep_rw(req, sqe, force_nonblock); + if (ret) + return ret; - if (unlikely(!(req->file->f_mode & FMODE_WRITE))) - return -EBADF; - } + if (unlikely(!(req->file->f_mode & FMODE_WRITE))) + return -EBADF; - return io_import_iovec(WRITE, req, iovec, iter); + if (!req->io) + return 0; + + io = req->io; + io->rw.iov = io->rw.fast_iov; + req->io = NULL; + ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter); + req->io = io; + if (ret < 0) + return ret; + + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + return 0; } static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, @@ -1913,7 +1931,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, size_t iov_count; ssize_t ret, io_size; - ret = io_write_prep(req, &iovec, &iter, force_nonblock); + ret = io_import_iovec(WRITE, req, &iovec, &iter); if (ret < 0) return ret; @@ -1995,13 +2013,10 @@ static int io_nop(struct io_kiocb *req) return 0; } -static int io_prep_fsync(struct io_kiocb *req) +static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - if (!req->sqe) - return 0; if (!req->file) return -EBADF; @@ -2016,7 +2031,6 @@ static int io_prep_fsync(struct io_kiocb *req) req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->len); - req->sqe = NULL; return 0; } @@ -2057,11 +2071,6 @@ static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct io_wq_work *work, *old_work; - int ret; - - ret = io_prep_fsync(req); - if (ret) - return ret; /* fsync always requires a blocking context */ if (force_nonblock) { @@ -2077,13 +2086,10 @@ static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } -static int io_prep_sfr(struct io_kiocb *req) +static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - if (!sqe) - return 0; if (!req->file) return -EBADF; @@ -2095,7 +2101,6 @@ static int io_prep_sfr(struct io_kiocb *req) req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->len); req->sync.flags = READ_ONCE(sqe->sync_range_flags); - req->sqe = NULL; return 0; } @@ -2122,11 +2127,6 @@ static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct io_wq_work *work, *old_work; - int ret; - - ret = io_prep_sfr(req); - if (ret) - return ret; /* sync_file_range always requires a blocking context */ if (force_nonblock) { @@ -2155,22 +2155,21 @@ static void io_sendrecv_async(struct io_wq_work **workptr) } #endif -static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; struct io_sr_msg *sr = &req->sr_msg; - int ret; + struct io_async_ctx *io = req->io; - if (!sqe) - return 0; sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + + if (!io) + return 0; + io->msg.iov = io->msg.fast_iov; - ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.iov); - req->sqe = NULL; - return ret; #else return -EOPNOTSUPP; #endif @@ -2201,11 +2200,16 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, kmsg->iov = kmsg->fast_iov; kmsg->msg.msg_iter.iov = kmsg->iov; } else { + struct io_sr_msg *sr = &req->sr_msg; + kmsg = &io.msg; kmsg->msg.msg_name = &addr; - ret = io_sendmsg_prep(req, &io); + + io.msg.iov = io.msg.fast_iov; + ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg, + sr->msg_flags, &io.msg.iov); if (ret) - goto out; + return ret; } flags = req->sr_msg.msg_flags; @@ -2228,7 +2232,6 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, ret = -EINTR; } -out: if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); io_cqring_add_event(req, ret); @@ -2241,22 +2244,22 @@ out: #endif } -static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +static int io_recvmsg_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; - int ret; + struct io_async_ctx *io = req->io; + + sr->msg_flags = READ_ONCE(sqe->msg_flags); + sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (!req->sqe) + if (!io) return 0; - sr->msg_flags = READ_ONCE(req->sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(req->sqe->addr)); io->msg.iov = io->msg.fast_iov; - ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.uaddr, &io->msg.iov); - req->sqe = NULL; - return ret; #else return -EOPNOTSUPP; #endif @@ -2287,11 +2290,17 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, kmsg->iov = kmsg->fast_iov; kmsg->msg.msg_iter.iov = kmsg->iov; } else { + struct io_sr_msg *sr = &req->sr_msg; + kmsg = &io.msg; kmsg->msg.msg_name = &addr; - ret = io_recvmsg_prep(req, &io); + + io.msg.iov = io.msg.fast_iov; + ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg, + sr->msg_flags, &io.msg.uaddr, + &io.msg.iov); if (ret) - goto out; + return ret; } flags = req->sr_msg.msg_flags; @@ -2315,7 +2324,6 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, ret = -EINTR; } -out: if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); io_cqring_add_event(req, ret); @@ -2328,15 +2336,11 @@ out: #endif } -static int io_accept_prep(struct io_kiocb *req) +static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; struct io_accept *accept = &req->accept; - if (!req->sqe) - return 0; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index) @@ -2345,7 +2349,6 @@ static int io_accept_prep(struct io_kiocb *req) accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); - req->sqe = NULL; return 0; #else return -EOPNOTSUPP; @@ -2393,10 +2396,6 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, #if defined(CONFIG_NET) int ret; - ret = io_accept_prep(req); - if (ret) - return ret; - ret = __io_accept(req, nxt, force_nonblock); if (ret == -EAGAIN && force_nonblock) { req->work.func = io_accept_finish; @@ -2410,25 +2409,25 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, #endif } -static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) +static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; - int ret; + struct io_connect *conn = &req->connect; + struct io_async_ctx *io = req->io; - if (!sqe) - return 0; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) return -EINVAL; - req->connect.addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->connect.addr_len = READ_ONCE(sqe->addr2); - ret = move_addr_to_kernel(req->connect.addr, req->connect.addr_len, + conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + conn->addr_len = READ_ONCE(sqe->addr2); + + if (!io) + return 0; + + return move_addr_to_kernel(conn->addr, conn->addr_len, &io->connect.address); - req->sqe = NULL; - return ret; #else return -EOPNOTSUPP; #endif @@ -2445,7 +2444,9 @@ static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, if (req->io) { io = req->io; } else { - ret = io_connect_prep(req, &__io); + ret = move_addr_to_kernel(req->connect.addr, + req->connect.addr_len, + &__io.connect.address); if (ret) goto out; io = &__io; @@ -2525,12 +2526,9 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) return -ENOENT; } -static int io_poll_remove_prep(struct io_kiocb *req) +static int io_poll_remove_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - const struct io_uring_sqe *sqe = req->sqe; - - if (!sqe) - return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || @@ -2538,7 +2536,6 @@ static int io_poll_remove_prep(struct io_kiocb *req) return -EINVAL; req->poll.addr = READ_ONCE(sqe->addr); - req->sqe = NULL; return 0; } @@ -2552,10 +2549,6 @@ static int io_poll_remove(struct io_kiocb *req) u64 addr; int ret; - ret = io_poll_remove_prep(req); - if (ret) - return ret; - addr = req->poll.addr; spin_lock_irq(&ctx->completion_lock); ret = io_poll_cancel(ctx, addr); @@ -2693,14 +2686,11 @@ static void io_poll_req_insert(struct io_kiocb *req) hlist_add_head(&req->hash_node, list); } -static int io_poll_add_prep(struct io_kiocb *req) +static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - const struct io_uring_sqe *sqe = req->sqe; struct io_poll_iocb *poll = &req->poll; u16 events; - if (!sqe) - return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) @@ -2710,7 +2700,6 @@ static int io_poll_add_prep(struct io_kiocb *req) events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - req->sqe = NULL; return 0; } @@ -2721,11 +2710,6 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) struct io_poll_table ipt; bool cancel = false; __poll_t mask; - int ret; - - ret = io_poll_add_prep(req); - if (ret) - return ret; INIT_IO_WORK(&req->work, io_poll_complete_work); INIT_HLIST_NODE(&req->hash_node); @@ -2844,12 +2828,9 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return 0; } -static int io_timeout_remove_prep(struct io_kiocb *req) +static int io_timeout_remove_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - const struct io_uring_sqe *sqe = req->sqe; - - if (!sqe) - return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) @@ -2860,7 +2841,6 @@ static int io_timeout_remove_prep(struct io_kiocb *req) if (req->timeout.flags) return -EINVAL; - req->sqe = NULL; return 0; } @@ -2872,10 +2852,6 @@ static int io_timeout_remove(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_timeout_remove_prep(req); - if (ret) - return ret; - spin_lock_irq(&ctx->completion_lock); ret = io_timeout_cancel(ctx, req->timeout.addr); @@ -2889,15 +2865,12 @@ static int io_timeout_remove(struct io_kiocb *req) return 0; } -static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, +static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool is_timeout_link) { - const struct io_uring_sqe *sqe = req->sqe; struct io_timeout_data *data; unsigned flags; - if (!sqe) - return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->buf_index || sqe->len != 1) @@ -2910,7 +2883,7 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, req->timeout.count = READ_ONCE(sqe->off); - if (!io && io_alloc_async_ctx(req)) + if (!req->io && io_alloc_async_ctx(req)) return -ENOMEM; data = &req->io->timeout; @@ -2926,7 +2899,6 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, data->mode = HRTIMER_MODE_REL; hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); - req->sqe = NULL; return 0; } @@ -2937,11 +2909,7 @@ static int io_timeout(struct io_kiocb *req) struct io_timeout_data *data; struct list_head *entry; unsigned span = 0; - int ret; - ret = io_timeout_prep(req, req->io, false); - if (ret) - return ret; data = &req->io->timeout; /* @@ -3067,12 +3035,9 @@ done: io_put_req_find_next(req, nxt); } -static int io_async_cancel_prep(struct io_kiocb *req) +static int io_async_cancel_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - const struct io_uring_sqe *sqe = req->sqe; - - if (!sqe) - return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || @@ -3080,28 +3045,20 @@ static int io_async_cancel_prep(struct io_kiocb *req) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); - req->sqe = NULL; return 0; } static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) { struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = io_async_cancel_prep(req); - if (ret) - return ret; io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); return 0; } -static int io_req_defer_prep(struct io_kiocb *req) +static int io_req_defer_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - struct io_async_ctx *io = req->io; - struct iov_iter iter; ssize_t ret = 0; switch (req->opcode) { @@ -3109,61 +3066,47 @@ static int io_req_defer_prep(struct io_kiocb *req) break; case IORING_OP_READV: case IORING_OP_READ_FIXED: - /* ensure prep does right import */ - req->io = NULL; - ret = io_read_prep(req, &iovec, &iter, true); - req->io = io; - if (ret < 0) - break; - io_req_map_rw(req, ret, iovec, inline_vecs, &iter); - ret = 0; + ret = io_read_prep(req, sqe, true); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: - /* ensure prep does right import */ - req->io = NULL; - ret = io_write_prep(req, &iovec, &iter, true); - req->io = io; - if (ret < 0) - break; - io_req_map_rw(req, ret, iovec, inline_vecs, &iter); - ret = 0; + ret = io_write_prep(req, sqe, true); break; case IORING_OP_POLL_ADD: - ret = io_poll_add_prep(req); + ret = io_poll_add_prep(req, sqe); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove_prep(req); + ret = io_poll_remove_prep(req, sqe); break; case IORING_OP_FSYNC: - ret = io_prep_fsync(req); + ret = io_prep_fsync(req, sqe); break; case IORING_OP_SYNC_FILE_RANGE: - ret = io_prep_sfr(req); + ret = io_prep_sfr(req, sqe); break; case IORING_OP_SENDMSG: - ret = io_sendmsg_prep(req, io); + ret = io_sendmsg_prep(req, sqe); break; case IORING_OP_RECVMSG: - ret = io_recvmsg_prep(req, io); + ret = io_recvmsg_prep(req, sqe); break; case IORING_OP_CONNECT: - ret = io_connect_prep(req, io); + ret = io_connect_prep(req, sqe); break; case IORING_OP_TIMEOUT: - ret = io_timeout_prep(req, io, false); + ret = io_timeout_prep(req, sqe, false); break; case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove_prep(req); + ret = io_timeout_remove_prep(req, sqe); break; case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel_prep(req); + ret = io_async_cancel_prep(req, sqe); break; case IORING_OP_LINK_TIMEOUT: - ret = io_timeout_prep(req, io, true); + ret = io_timeout_prep(req, sqe, true); break; case IORING_OP_ACCEPT: - ret = io_accept_prep(req); + ret = io_accept_prep(req, sqe); break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -3175,7 +3118,7 @@ static int io_req_defer_prep(struct io_kiocb *req) return ret; } -static int io_req_defer(struct io_kiocb *req) +static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -3184,10 +3127,10 @@ static int io_req_defer(struct io_kiocb *req) if (!req_need_defer(req) && list_empty(&ctx->defer_list)) return 0; - if (io_alloc_async_ctx(req)) + if (!req->io && io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req); + ret = io_req_defer_prep(req, sqe); if (ret < 0) return ret; @@ -3203,9 +3146,8 @@ static int io_req_defer(struct io_kiocb *req) return -EIOCBQUEUED; } -__attribute__((nonnull)) -static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -3215,48 +3157,109 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, ret = io_nop(req); break; case IORING_OP_READV: - ret = io_read(req, nxt, force_nonblock); - break; - case IORING_OP_WRITEV: - ret = io_write(req, nxt, force_nonblock); - break; case IORING_OP_READ_FIXED: + if (sqe) { + ret = io_read_prep(req, sqe, force_nonblock); + if (ret < 0) + break; + } ret = io_read(req, nxt, force_nonblock); break; + case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + if (sqe) { + ret = io_write_prep(req, sqe, force_nonblock); + if (ret < 0) + break; + } ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_FSYNC: + if (sqe) { + ret = io_prep_fsync(req, sqe); + if (ret < 0) + break; + } ret = io_fsync(req, nxt, force_nonblock); break; case IORING_OP_POLL_ADD: + if (sqe) { + ret = io_poll_add_prep(req, sqe); + if (ret) + break; + } ret = io_poll_add(req, nxt); break; case IORING_OP_POLL_REMOVE: + if (sqe) { + ret = io_poll_remove_prep(req, sqe); + if (ret < 0) + break; + } ret = io_poll_remove(req); break; case IORING_OP_SYNC_FILE_RANGE: + if (sqe) { + ret = io_prep_sfr(req, sqe); + if (ret < 0) + break; + } ret = io_sync_file_range(req, nxt, force_nonblock); break; case IORING_OP_SENDMSG: + if (sqe) { + ret = io_sendmsg_prep(req, sqe); + if (ret < 0) + break; + } ret = io_sendmsg(req, nxt, force_nonblock); break; case IORING_OP_RECVMSG: + if (sqe) { + ret = io_recvmsg_prep(req, sqe); + if (ret) + break; + } ret = io_recvmsg(req, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: + if (sqe) { + ret = io_timeout_prep(req, sqe, false); + if (ret) + break; + } ret = io_timeout(req); break; case IORING_OP_TIMEOUT_REMOVE: + if (sqe) { + ret = io_timeout_remove_prep(req, sqe); + if (ret) + break; + } ret = io_timeout_remove(req); break; case IORING_OP_ACCEPT: + if (sqe) { + ret = io_accept_prep(req, sqe); + if (ret) + break; + } ret = io_accept(req, nxt, force_nonblock); break; case IORING_OP_CONNECT: + if (sqe) { + ret = io_connect_prep(req, sqe); + if (ret) + break; + } ret = io_connect(req, nxt, force_nonblock); break; case IORING_OP_ASYNC_CANCEL: + if (sqe) { + ret = io_async_cancel_prep(req, sqe); + if (ret) + break; + } ret = io_async_cancel(req, nxt); break; default: @@ -3300,7 +3303,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; req->in_async = true; do { - ret = io_issue_sqe(req, &nxt, false); + ret = io_issue_sqe(req, NULL, &nxt, false); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -3366,14 +3369,15 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, return table->files[index & IORING_FILE_TABLE_MASK]; } -static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) +static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, + const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; unsigned flags; int fd, ret; - flags = READ_ONCE(req->sqe->flags); - fd = READ_ONCE(req->sqe->fd); + flags = READ_ONCE(sqe->flags); + fd = READ_ONCE(sqe->fd); if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; @@ -3505,7 +3509,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return nxt; } -static void __io_queue_sqe(struct io_kiocb *req) +static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_kiocb *linked_timeout; struct io_kiocb *nxt = NULL; @@ -3514,7 +3518,7 @@ static void __io_queue_sqe(struct io_kiocb *req) again: linked_timeout = io_prep_linked_timeout(req); - ret = io_issue_sqe(req, &nxt, true); + ret = io_issue_sqe(req, sqe, &nxt, true); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -3561,7 +3565,7 @@ done_req: } } -static void io_queue_sqe(struct io_kiocb *req) +static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; @@ -3571,7 +3575,7 @@ static void io_queue_sqe(struct io_kiocb *req) } req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); - ret = io_req_defer(req); + ret = io_req_defer(req, sqe); if (ret) { if (ret != -EIOCBQUEUED) { io_cqring_add_event(req, ret); @@ -3579,7 +3583,7 @@ static void io_queue_sqe(struct io_kiocb *req) io_double_put_req(req); } } else - __io_queue_sqe(req); + __io_queue_sqe(req, sqe); } static inline void io_queue_link_head(struct io_kiocb *req) @@ -3588,25 +3592,25 @@ static inline void io_queue_link_head(struct io_kiocb *req) io_cqring_add_event(req, -ECANCELED); io_double_put_req(req); } else - io_queue_sqe(req); + io_queue_sqe(req, NULL); } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ IOSQE_IO_HARDLINK) -static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, - struct io_kiocb **link) +static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_submit_state *state, struct io_kiocb **link) { struct io_ring_ctx *ctx = req->ctx; int ret; /* enforce forwards compatibility on users */ - if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) { + if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; goto err_req; } - ret = io_req_set_file(state, req); + ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { err_req: io_cqring_add_event(req, ret); @@ -3624,10 +3628,10 @@ err_req: if (*link) { struct io_kiocb *prev = *link; - if (req->sqe->flags & IOSQE_IO_DRAIN) + if (sqe->flags & IOSQE_IO_DRAIN) (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - if (req->sqe->flags & IOSQE_IO_HARDLINK) + if (sqe->flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; if (io_alloc_async_ctx(req)) { @@ -3635,7 +3639,7 @@ err_req: goto err_req; } - ret = io_req_defer_prep(req); + ret = io_req_defer_prep(req, sqe); if (ret) { /* fail even hard links since we don't submit */ prev->flags |= REQ_F_FAIL_LINK; @@ -3643,15 +3647,18 @@ err_req: } trace_io_uring_link(ctx, req, prev); list_add_tail(&req->link_list, &prev->link_list); - } else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { + } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; - if (req->sqe->flags & IOSQE_IO_HARDLINK) + if (sqe->flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; INIT_LIST_HEAD(&req->link_list); + ret = io_req_defer_prep(req, sqe); + if (ret) + req->flags |= REQ_F_FAIL_LINK; *link = req; } else { - io_queue_sqe(req); + io_queue_sqe(req, sqe); } return true; @@ -3696,14 +3703,15 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) } /* - * Fetch an sqe, if one is available. Note that req->sqe will point to memory + * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always * being a good citizen. If members of the sqe are validated and then later * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ -static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) +static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe **sqe_ptr) { struct io_rings *rings = ctx->rings; u32 *sq_array = ctx->sq_array; @@ -3730,9 +3738,9 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) * link list. */ req->sequence = ctx->cached_sq_head; - req->sqe = &ctx->sq_sqes[head]; - req->opcode = READ_ONCE(req->sqe->opcode); - req->user_data = READ_ONCE(req->sqe->user_data); + *sqe_ptr = &ctx->sq_sqes[head]; + req->opcode = READ_ONCE((*sqe_ptr)->opcode); + req->user_data = READ_ONCE((*sqe_ptr)->user_data); ctx->cached_sq_head++; return true; } @@ -3764,6 +3772,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } for (i = 0; i < nr; i++) { + const struct io_uring_sqe *sqe; struct io_kiocb *req; unsigned int sqe_flags; @@ -3773,7 +3782,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, submitted = -EAGAIN; break; } - if (!io_get_sqring(ctx, req)) { + if (!io_get_sqring(ctx, req, &sqe)) { __io_free_req(req); break; } @@ -3787,7 +3796,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } submitted++; - sqe_flags = req->sqe->flags; + sqe_flags = sqe->flags; req->ring_file = ring_file; req->ring_fd = ring_fd; @@ -3795,7 +3804,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, req->in_async = async; req->needs_fixed_file = async; trace_io_uring_submit_sqe(ctx, req->user_data, true, async); - if (!io_submit_sqe(req, statep, &link)) + if (!io_submit_sqe(req, sqe, statep, &link)) break; /* * If previous wasn't linked and we have a linked command, -- cgit v1.2.3 From 1f424e8bd18754d27b15f49359004b0cea344fb5 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Sun, 22 Dec 2019 22:46:54 +0800 Subject: io-wq: remove unused busy list from io_sqe Commit e61df66c69b1 ("io-wq: ensure free/busy list browsing see all items") added a list for io workers in addition to the free and busy lists, not only making worker walk cleaner, but leaving the busy list unused. Let's remove it. Signed-off-by: Hillf Danton Signed-off-by: Jens Axboe --- fs/io-wq.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 11e80b7252a8..a1c85458f021 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -92,7 +92,6 @@ struct io_wqe { struct io_wqe_acct acct[2]; struct hlist_nulls_head free_list; - struct hlist_nulls_head busy_list; struct list_head all_list; struct io_wq *wq; @@ -327,7 +326,6 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; hlist_nulls_del_init_rcu(&worker->nulls_node); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->busy_list); } /* @@ -365,7 +363,6 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) { if (!(worker->flags & IO_WORKER_F_FREE)) { worker->flags |= IO_WORKER_F_FREE; - hlist_nulls_del_init_rcu(&worker->nulls_node); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); } @@ -798,10 +795,6 @@ void io_wq_cancel_all(struct io_wq *wq) set_bit(IO_WQ_BIT_CANCEL, &wq->state); - /* - * Browse both lists, as there's a gap between handing work off - * to a worker and the worker putting itself on the busy_list - */ rcu_read_lock(); for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; @@ -1049,7 +1042,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) spin_lock_init(&wqe->lock); INIT_WQ_LIST(&wqe->work_list); INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); - INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1); INIT_LIST_HEAD(&wqe->all_list); } -- cgit v1.2.3 From fd1c4bc6e9b34a5e4fe7a3130a49380ef9d7037c Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 24 Dec 2019 09:14:29 -0700 Subject: io-wq: add cond_resched() to worker thread Reschedule the current IO worker to cut the risk that it is becoming a cpu hog. Signed-off-by: Hillf Danton Signed-off-by: Jens Axboe --- fs/io-wq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io-wq.c b/fs/io-wq.c index a1c85458f021..541c8a3e0bbb 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -429,6 +429,8 @@ next: if (signal_pending(current)) flush_signals(current); + cond_resched(); + spin_lock_irq(&worker->lock); worker->cur_work = work; spin_unlock_irq(&worker->lock); -- cgit v1.2.3