summaryrefslogtreecommitdiffstats
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c776
1 files changed, 425 insertions, 351 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c14cc104d498..ec53aa7cdc94 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -187,6 +187,7 @@ struct io_ring_ctx {
bool compat;
bool account_mem;
bool cq_overflow_flushed;
+ bool drain_next;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -237,6 +238,8 @@ struct io_ring_ctx {
struct user_struct *user;
+ struct cred *creds;
+
/* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
struct completion *completions;
@@ -279,16 +282,6 @@ struct io_ring_ctx {
} ____cacheline_aligned_in_smp;
};
-struct sqe_submit {
- const struct io_uring_sqe *sqe;
- struct file *ring_file;
- int ring_fd;
- u32 sequence;
- bool has_user;
- bool in_async;
- bool needs_fixed_file;
-};
-
/*
* First field must be the file pointer in all the
* iocb unions! See also 'struct kiocb' in <linux/fs.h>
@@ -299,12 +292,20 @@ struct io_poll_iocb {
__poll_t events;
bool done;
bool canceled;
- struct wait_queue_entry wait;
+ struct wait_queue_entry *wait;
+};
+
+struct io_timeout_data {
+ struct io_kiocb *req;
+ struct hrtimer timer;
+ struct timespec64 ts;
+ enum hrtimer_mode mode;
+ u32 seq_offset;
};
struct io_timeout {
struct file *file;
- struct hrtimer timer;
+ struct io_timeout_data *data;
};
/*
@@ -321,7 +322,12 @@ struct io_kiocb {
struct io_timeout timeout;
};
- struct sqe_submit submit;
+ const struct io_uring_sqe *sqe;
+ struct file *ring_file;
+ int ring_fd;
+ bool has_user;
+ bool in_async;
+ bool needs_fixed_file;
struct io_ring_ctx *ctx;
union {
@@ -334,19 +340,20 @@ struct io_kiocb {
#define REQ_F_NOWAIT 1 /* must not punt to workers */
#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
#define REQ_F_FIXED_FILE 4 /* ctx owns file */
-#define REQ_F_SEQ_PREV 8 /* sequential with previous */
+#define REQ_F_LINK_NEXT 8 /* already grabbed next link */
#define REQ_F_IO_DRAIN 16 /* drain existing IO first */
#define REQ_F_IO_DRAINED 32 /* drain done */
#define REQ_F_LINK 64 /* linked sqes */
#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */
#define REQ_F_FAIL_LINK 256 /* fail rest of links */
-#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */
+#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */
#define REQ_F_TIMEOUT 1024 /* timeout request */
#define REQ_F_ISREG 2048 /* regular file */
#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */
#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
#define REQ_F_INFLIGHT 16384 /* on inflight list */
#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
+#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */
u64 user_data;
u32 result;
u32 sequence;
@@ -384,6 +391,9 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void __io_free_req(struct io_kiocb *req);
static void io_put_req(struct io_kiocb *req);
static void io_double_put_req(struct io_kiocb *req);
+static void __io_double_put_req(struct io_kiocb *req);
+static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
+static void io_queue_linked_timeout(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
@@ -522,12 +532,13 @@ static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
opcode == IORING_OP_WRITE_FIXED);
}
-static inline bool io_prep_async_work(struct io_kiocb *req)
+static inline bool io_prep_async_work(struct io_kiocb *req,
+ struct io_kiocb **link)
{
bool do_hashed = false;
- if (req->submit.sqe) {
- switch (req->submit.sqe->opcode) {
+ if (req->sqe) {
+ switch (req->sqe->opcode) {
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
do_hashed = true;
@@ -538,6 +549,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req)
case IORING_OP_RECVMSG:
case IORING_OP_ACCEPT:
case IORING_OP_POLL_ADD:
+ case IORING_OP_CONNECT:
/*
* We know REQ_F_ISREG is not set on some of these
* opcodes, but this enables us to keep the check in
@@ -547,17 +559,21 @@ static inline bool io_prep_async_work(struct io_kiocb *req)
req->work.flags |= IO_WQ_WORK_UNBOUND;
break;
}
- if (io_sqe_needs_user(req->submit.sqe))
+ if (io_sqe_needs_user(req->sqe))
req->work.flags |= IO_WQ_WORK_NEEDS_USER;
}
+ *link = io_prep_linked_timeout(req);
return do_hashed;
}
static inline void io_queue_async_work(struct io_kiocb *req)
{
- bool do_hashed = io_prep_async_work(req);
struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *link;
+ bool do_hashed;
+
+ do_hashed = io_prep_async_work(req, &link);
trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
req->flags);
@@ -567,13 +583,16 @@ static inline void io_queue_async_work(struct io_kiocb *req)
io_wq_enqueue_hashed(ctx->io_wq, &req->work,
file_inode(req->file));
}
+
+ if (link)
+ io_queue_linked_timeout(link);
}
static void io_kill_timeout(struct io_kiocb *req)
{
int ret;
- ret = hrtimer_try_to_cancel(&req->timeout.timer);
+ ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
if (ret != -1) {
atomic_inc(&req->ctx->cq_timeouts);
list_del_init(&req->list);
@@ -602,11 +621,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
__io_commit_cqring(ctx);
while ((req = io_get_deferred_req(ctx)) != NULL) {
- if (req->flags & REQ_F_SHADOW_DRAIN) {
- /* Just for drain, free it. */
- __io_free_req(req);
- continue;
- }
req->flags |= REQ_F_IO_DRAINED;
io_queue_async_work(req);
}
@@ -640,7 +654,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
eventfd_signal(ctx->cq_ev_fd, 1);
}
-static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
+/* Returns true if there are no backlogged entries after the flush */
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
struct io_rings *rings = ctx->rings;
struct io_uring_cqe *cqe;
@@ -650,10 +665,10 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
if (!force) {
if (list_empty_careful(&ctx->cq_overflow_list))
- return;
+ return true;
if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
rings->cq_ring_entries))
- return;
+ return false;
}
spin_lock_irqsave(&ctx->completion_lock, flags);
@@ -662,6 +677,7 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
if (force)
ctx->cq_overflow_flushed = true;
+ cqe = NULL;
while (!list_empty(&ctx->cq_overflow_list)) {
cqe = io_get_cqring(ctx);
if (!cqe && !force)
@@ -689,6 +705,8 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
list_del(&req->list);
io_put_req(req);
}
+
+ return cqe != NULL;
}
static void io_cqring_fill_event(struct io_kiocb *req, long res)
@@ -788,6 +806,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
}
got_it:
+ req->ring_file = NULL;
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
@@ -817,6 +836,8 @@ static void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ if (req->flags & REQ_F_FREE_SQE)
+ kfree(req->sqe);
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file);
if (req->flags & REQ_F_INFLIGHT) {
@@ -828,6 +849,8 @@ static void __io_free_req(struct io_kiocb *req)
wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
}
+ if (req->flags & REQ_F_TIMEOUT)
+ kfree(req->timeout.data);
percpu_ref_put(&ctx->refs);
if (likely(!io_is_fallback_req(req)))
kmem_cache_free(req_cachep, req);
@@ -840,7 +863,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = hrtimer_try_to_cancel(&req->timeout.timer);
+ ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
if (ret != -1) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx);
@@ -858,6 +881,10 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
struct io_kiocb *nxt;
bool wake_ev = false;
+ /* Already got next link */
+ if (req->flags & REQ_F_LINK_NEXT)
+ return;
+
/*
* The list should never be empty when we are called here. But could
* potentially happen if the chain is messed up, check to be on the
@@ -866,31 +893,26 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
while (nxt) {
list_del_init(&nxt->list);
+
+ if ((req->flags & REQ_F_LINK_TIMEOUT) &&
+ (nxt->flags & REQ_F_TIMEOUT)) {
+ wake_ev |= io_link_cancel_timeout(nxt);
+ nxt = list_first_entry_or_null(&req->link_list,
+ struct io_kiocb, list);
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
+ continue;
+ }
if (!list_empty(&req->link_list)) {
INIT_LIST_HEAD(&nxt->link_list);
list_splice(&req->link_list, &nxt->link_list);
nxt->flags |= REQ_F_LINK;
}
- /*
- * If we're in async work, we can continue processing the chain
- * in this context instead of having to queue up new async work.
- */
- if (req->flags & REQ_F_LINK_TIMEOUT) {
- wake_ev = io_link_cancel_timeout(nxt);
-
- /* we dropped this link, get next */
- nxt = list_first_entry_or_null(&req->link_list,
- struct io_kiocb, list);
- } else if (nxtptr && io_wq_current_is_worker()) {
- *nxtptr = nxt;
- break;
- } else {
- io_queue_async_work(nxt);
- break;
- }
+ *nxtptr = nxt;
+ break;
}
+ req->flags |= REQ_F_LINK_NEXT;
if (wake_ev)
io_cqring_ev_posted(ctx);
}
@@ -913,12 +935,13 @@ static void io_fail_links(struct io_kiocb *req)
trace_io_uring_fail_link(req, link);
if ((req->flags & REQ_F_LINK_TIMEOUT) &&
- link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) {
+ link->sqe->opcode == IORING_OP_LINK_TIMEOUT) {
io_link_cancel_timeout(link);
} else {
io_cqring_fill_event(link, -ECANCELED);
- io_double_put_req(link);
+ __io_double_put_req(link);
}
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
}
io_commit_cqring(ctx);
@@ -926,12 +949,10 @@ static void io_fail_links(struct io_kiocb *req)
io_cqring_ev_posted(ctx);
}
-static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
+static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
{
- if (likely(!(req->flags & REQ_F_LINK))) {
- __io_free_req(req);
+ if (likely(!(req->flags & REQ_F_LINK)))
return;
- }
/*
* If LINK is set, we have dependent requests in this chain. If we
@@ -957,32 +978,30 @@ static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
} else {
io_req_link_next(req, nxt);
}
-
- __io_free_req(req);
}
static void io_free_req(struct io_kiocb *req)
{
- io_free_req_find_next(req, NULL);
+ struct io_kiocb *nxt = NULL;
+
+ io_req_find_next(req, &nxt);
+ __io_free_req(req);
+
+ if (nxt)
+ io_queue_async_work(nxt);
}
/*
* Drop reference to request, return next in chain (if there is one) if this
* was the last reference to this request.
*/
+__attribute__((nonnull))
static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
{
- struct io_kiocb *nxt = NULL;
+ io_req_find_next(req, nxtptr);
if (refcount_dec_and_test(&req->refs))
- io_free_req_find_next(req, &nxt);
-
- if (nxt) {
- if (nxtptr)
- *nxtptr = nxt;
- else
- io_queue_async_work(nxt);
- }
+ __io_free_req(req);
}
static void io_put_req(struct io_kiocb *req)
@@ -991,13 +1010,24 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
-static void io_double_put_req(struct io_kiocb *req)
+/*
+ * Must only be used if we don't need to care about links, usually from
+ * within the completion handling itself.
+ */
+static void __io_double_put_req(struct io_kiocb *req)
{
/* drop both submit and complete references */
if (refcount_sub_and_test(2, &req->refs))
__io_free_req(req);
}
+static void io_double_put_req(struct io_kiocb *req)
+{
+ /* drop both submit and complete references */
+ if (refcount_sub_and_test(2, &req->refs))
+ io_free_req(req);
+}
+
static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
{
struct io_rings *rings = ctx->rings;
@@ -1049,7 +1079,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
* completions for those, only batch free for fixed
* file and non-linked commands.
*/
- if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+ if (((req->flags &
+ (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) ==
REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) {
reqs[to_free++] = req;
if (to_free == ARRAY_SIZE(reqs))
@@ -1367,7 +1398,7 @@ static bool io_file_supports_async(struct file *file)
static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
{
- const struct io_uring_sqe *sqe = req->submit.sqe;
+ const struct io_uring_sqe *sqe = req->sqe;
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw;
unsigned ioprio;
@@ -1454,15 +1485,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
bool in_async)
{
- if (in_async && ret >= 0 && nxt && kiocb->ki_complete == io_complete_rw)
+ if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
*nxt = __io_complete_rw(kiocb, ret);
else
io_rw_done(kiocb, ret);
}
-static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
- const struct io_uring_sqe *sqe,
- struct iov_iter *iter)
+static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
+ const struct io_uring_sqe *sqe,
+ struct iov_iter *iter)
{
size_t len = READ_ONCE(sqe->len);
struct io_mapped_ubuf *imu;
@@ -1534,11 +1565,10 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
return len;
}
-static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
- const struct sqe_submit *s, struct iovec **iovec,
- struct iov_iter *iter)
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct iov_iter *iter)
{
- const struct io_uring_sqe *sqe = s->sqe;
+ const struct io_uring_sqe *sqe = req->sqe;
void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
size_t sqe_len = READ_ONCE(sqe->len);
u8 opcode;
@@ -1552,18 +1582,16 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
* flag.
*/
opcode = READ_ONCE(sqe->opcode);
- if (opcode == IORING_OP_READ_FIXED ||
- opcode == IORING_OP_WRITE_FIXED) {
- ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+ if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
*iovec = NULL;
- return ret;
+ return io_import_fixed(req->ctx, rw, sqe, iter);
}
- if (!s->has_user)
+ if (!req->has_user)
return -EFAULT;
#ifdef CONFIG_COMPAT
- if (ctx->compat)
+ if (req->ctx->compat)
return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
iovec, iter);
#endif
@@ -1591,9 +1619,19 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
return -EAGAIN;
while (iov_iter_count(iter)) {
- struct iovec iovec = iov_iter_iovec(iter);
+ struct iovec iovec;
ssize_t nr;
+ if (!iov_iter_is_bvec(iter)) {
+ iovec = iov_iter_iovec(iter);
+ } else {
+ /* fixed buffers import bvec */
+ iovec.iov_base = kmap(iter->bvec->bv_page)
+ + iter->iov_offset;
+ iovec.iov_len = min(iter->count,
+ iter->bvec->bv_len - iter->iov_offset);
+ }
+
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
iovec.iov_len, &kiocb->ki_pos);
@@ -1602,6 +1640,9 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
iovec.iov_len, &kiocb->ki_pos);
}
+ if (iov_iter_is_bvec(iter))
+ kunmap(iter->bvec->bv_page);
+
if (nr < 0) {
if (!ret)
ret = nr;
@@ -1634,7 +1675,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
if (unlikely(!(file->f_mode & FMODE_READ)))
return -EBADF;
- ret = io_import_iovec(req->ctx, READ, &req->submit, &iovec, &iter);
+ ret = io_import_iovec(READ, req, &iovec, &iter);
if (ret < 0)
return ret;
@@ -1666,7 +1707,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
ret2 = -EAGAIN;
/* Catch -EAGAIN return for forced non-blocking submission */
if (!force_nonblock || ret2 != -EAGAIN)
- kiocb_done(kiocb, ret2, nxt, req->submit.in_async);
+ kiocb_done(kiocb, ret2, nxt, req->in_async);
else
ret = -EAGAIN;
}
@@ -1692,7 +1733,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
if (unlikely(!(file->f_mode & FMODE_WRITE)))
return -EBADF;
- ret = io_import_iovec(req->ctx, WRITE, &req->submit, &iovec, &iter);
+ ret = io_import_iovec(WRITE, req, &iovec, &iter);
if (ret < 0)
return ret;
@@ -1729,7 +1770,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
else
ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
if (!force_nonblock || ret2 != -EAGAIN)
- kiocb_done(kiocb, ret2, nxt, req->submit.in_async);
+ kiocb_done(kiocb, ret2, nxt, req->in_async);
else
ret = -EAGAIN;
}
@@ -1919,7 +1960,7 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+ if (sqe->ioprio || sqe->len || sqe->buf_index)
return -EINVAL;
addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
@@ -1944,6 +1985,38 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
#endif
}
+static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_kiocb **nxt, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ struct sockaddr __user *addr;
+ unsigned file_flags;
+ int addr_len, ret;
+
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
+ return -EINVAL;
+
+ addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
+ addr_len = READ_ONCE(sqe->addr2);
+ file_flags = force_nonblock ? O_NONBLOCK : 0;
+
+ ret = __sys_connect_file(req->file, addr, addr_len, file_flags);
+ if (ret == -EAGAIN && force_nonblock)
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
static inline void io_poll_remove_req(struct io_kiocb *req)
{
if (!RB_EMPTY_NODE(&req->rb_node)) {
@@ -1958,8 +2031,8 @@ static void io_poll_remove_one(struct io_kiocb *req)
spin_lock(&poll->head->lock);
WRITE_ONCE(poll->canceled, true);
- if (!list_empty(&poll->wait.entry)) {
- list_del_init(&poll->wait.entry);
+ if (!list_empty(&poll->wait->entry)) {
+ list_del_init(&poll->wait->entry);
io_queue_async_work(req);
}
spin_unlock(&poll->head->lock);
@@ -2027,12 +2100,16 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
{
struct io_ring_ctx *ctx = req->ctx;
req->poll.done = true;
- io_cqring_fill_event(req, mangle_poll(mask));
+ kfree(req->poll.wait);
+ if (error)
+ io_cqring_fill_event(req, error);
+ else
+ io_cqring_fill_event(req, mangle_poll(mask));
io_commit_cqring(ctx);
}
@@ -2045,11 +2122,16 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *nxt = NULL;
__poll_t mask = 0;
+ int ret = 0;
- if (work->flags & IO_WQ_WORK_CANCEL)
+ if (work->flags & IO_WQ_WORK_CANCEL) {
WRITE_ONCE(poll->canceled, true);
+ ret = -ECANCELED;
+ } else if (READ_ONCE(poll->canceled)) {
+ ret = -ECANCELED;
+ }
- if (!READ_ONCE(poll->canceled))
+ if (ret != -ECANCELED)
mask = vfs_poll(poll->file, &pt) & poll->events;
/*
@@ -2060,17 +2142,19 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
* avoid further branches in the fast path.
*/
spin_lock_irq(&ctx->completion_lock);
- if (!mask && !READ_ONCE(poll->canceled)) {
- add_wait_queue(poll->head, &poll->wait);
+ if (!mask && ret != -ECANCELED) {
+ add_wait_queue(poll->head, poll->wait);
spin_unlock_irq(&ctx->completion_lock);
return;
}
io_poll_remove_req(req);
- io_poll_complete(req, mask);
+ io_poll_complete(req, mask, ret);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
+ if (ret < 0 && req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
io_put_req_find_next(req, &nxt);
if (nxt)
*workptr = &nxt->work;
@@ -2079,8 +2163,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
- struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
- wait);
+ struct io_poll_iocb *poll = wait->private;
struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
struct io_ring_ctx *ctx = req->ctx;
__poll_t mask = key_to_poll(key);
@@ -2090,7 +2173,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && !(mask & poll->events))
return 0;
- list_del_init(&poll->wait.entry);
+ list_del_init(&poll->wait->entry);
/*
* Run completion inline if we can. We're using trylock here because
@@ -2100,7 +2183,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
*/
if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
io_poll_remove_req(req);
- io_poll_complete(req, mask);
+ io_poll_complete(req, mask, 0);
req->flags |= REQ_F_COMP_LOCKED;
io_put_req(req);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -2131,7 +2214,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
pt->error = 0;
pt->req->poll.head = head;
- add_wait_queue(head, &pt->req->poll.wait);
+ add_wait_queue(head, pt->req->poll.wait);
}
static void io_poll_req_insert(struct io_kiocb *req)
@@ -2170,7 +2253,11 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (!poll->file)
return -EBADF;
- req->submit.sqe = NULL;
+ poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL);
+ if (!poll->wait)
+ return -ENOMEM;
+
+ req->sqe = NULL;
INIT_IO_WORK(&req->work, io_poll_complete_work);
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
@@ -2186,8 +2273,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
/* initialized the list so that we can do list_empty checks */
- INIT_LIST_HEAD(&poll->wait.entry);
- init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+ INIT_LIST_HEAD(&poll->wait->entry);
+ init_waitqueue_func_entry(poll->wait, io_poll_wake);
+ poll->wait->private = poll;
INIT_LIST_HEAD(&req->list);
@@ -2196,14 +2284,14 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
spin_lock_irq(&ctx->completion_lock);
if (likely(poll->head)) {
spin_lock(&poll->head->lock);
- if (unlikely(list_empty(&poll->wait.entry))) {
+ if (unlikely(list_empty(&poll->wait->entry))) {
if (ipt.error)
cancel = true;
ipt.error = 0;
mask = 0;
}
if (mask || ipt.error)
- list_del_init(&poll->wait.entry);
+ list_del_init(&poll->wait->entry);
else if (cancel)
WRITE_ONCE(poll->canceled, true);
else if (!poll->done) /* actually waiting for an event */
@@ -2212,7 +2300,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
}
if (mask) { /* no async, we'd stolen it */
ipt.error = 0;
- io_poll_complete(req, mask);
+ io_poll_complete(req, mask, 0);
}
spin_unlock_irq(&ctx->completion_lock);
@@ -2225,12 +2313,12 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
- struct io_ring_ctx *ctx;
- struct io_kiocb *req;
+ struct io_timeout_data *data = container_of(timer,
+ struct io_timeout_data, timer);
+ struct io_kiocb *req = data->req;
+ struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
- req = container_of(timer, struct io_kiocb, timeout.timer);
- ctx = req->ctx;
atomic_inc(&ctx->cq_timeouts);
spin_lock_irqsave(&ctx->completion_lock, flags);
@@ -2280,10 +2368,12 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
if (ret == -ENOENT)
return ret;
- ret = hrtimer_try_to_cancel(&req->timeout.timer);
+ ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
if (ret == -1)
return -EALREADY;
+ if (req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
io_cqring_fill_event(req, -ECANCELED);
io_put_req(req);
return 0;
@@ -2320,34 +2410,54 @@ static int io_timeout_remove(struct io_kiocb *req,
return 0;
}
-static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_timeout_setup(struct io_kiocb *req)
{
- unsigned count;
- struct io_ring_ctx *ctx = req->ctx;
- struct list_head *entry;
- enum hrtimer_mode mode;
- struct timespec64 ts;
- unsigned span = 0;
+ const struct io_uring_sqe *sqe = req->sqe;
+ struct io_timeout_data *data;
unsigned flags;
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len != 1)
+ if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
if (flags & ~IORING_TIMEOUT_ABS)
return -EINVAL;
- if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
+ data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ data->req = req;
+ req->timeout.data = data;
+ req->flags |= REQ_F_TIMEOUT;
+
+ if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
return -EFAULT;
if (flags & IORING_TIMEOUT_ABS)
- mode = HRTIMER_MODE_ABS;
+ data->mode = HRTIMER_MODE_ABS;
else
- mode = HRTIMER_MODE_REL;
+ data->mode = HRTIMER_MODE_REL;
- hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode);
- req->flags |= REQ_F_TIMEOUT;
+ hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
+ return 0;
+}
+
+static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ unsigned count;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_timeout_data *data;
+ struct list_head *entry;
+ unsigned span = 0;
+ int ret;
+
+ ret = io_timeout_setup(req);
+ /* common setup allows flags (like links) set, we don't */
+ if (!ret && sqe->flags)
+ ret = -EINVAL;
+ if (ret)
+ return ret;
/*
* sqe->off holds how many events that need to occur for this
@@ -2363,8 +2473,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
req->sequence = ctx->cached_sq_head + count - 1;
- /* reuse it to store the count */
- req->submit.sequence = count;
+ req->timeout.data->seq_offset = count;
/*
* Insertion sort, ensuring the first entry in the list is always
@@ -2375,6 +2484,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
unsigned nxt_sq_head;
long long tmp, tmp_nxt;
+ u32 nxt_offset = nxt->timeout.data->seq_offset;
if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
continue;
@@ -2384,8 +2494,8 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
* long to store it.
*/
tmp = (long long)ctx->cached_sq_head + count - 1;
- nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
- tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
+ nxt_sq_head = nxt->sequence - nxt_offset + 1;
+ tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
/*
* cached_sq_head may overflow, and it will never overflow twice
@@ -2407,8 +2517,9 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->sequence -= span;
add:
list_add(&req->list, entry);
- req->timeout.timer.function = io_timeout_fn;
- hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode);
+ data = req->timeout.data;
+ data->timer.function = io_timeout_fn;
+ hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
spin_unlock_irq(&ctx->completion_lock);
return 0;
}
@@ -2443,7 +2554,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
struct io_kiocb *req, __u64 sqe_addr,
- struct io_kiocb **nxt)
+ struct io_kiocb **nxt, int success_ret)
{
unsigned long flags;
int ret;
@@ -2460,6 +2571,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
goto done;
ret = io_poll_cancel(ctx, sqe_addr);
done:
+ if (!ret)
+ ret = success_ret;
io_cqring_fill_event(req, ret);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -2481,13 +2594,12 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
sqe->cancel_flags)
return -EINVAL;
- io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), NULL);
+ io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0);
return 0;
}
static int io_req_defer(struct io_kiocb *req)
{
- const struct io_uring_sqe *sqe = req->submit.sqe;
struct io_uring_sqe *sqe_copy;
struct io_ring_ctx *ctx = req->ctx;
@@ -2506,34 +2618,35 @@ static int io_req_defer(struct io_kiocb *req)
return 0;
}
- memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
- req->submit.sqe = sqe_copy;
+ memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy));
+ req->flags |= REQ_F_FREE_SQE;
+ req->sqe = sqe_copy;
- trace_io_uring_defer(ctx, req, false);
+ trace_io_uring_defer(ctx, req, req->user_data);
list_add_tail(&req->list, &ctx->defer_list);
spin_unlock_irq(&ctx->completion_lock);
return -EIOCBQUEUED;
}
-static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+__attribute__((nonnull))
+static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
{
int ret, opcode;
- struct sqe_submit *s = &req->submit;
struct io_ring_ctx *ctx = req->ctx;
- opcode = READ_ONCE(s->sqe->opcode);
+ opcode = READ_ONCE(req->sqe->opcode);
switch (opcode) {
case IORING_OP_NOP:
ret = io_nop(req);
break;
case IORING_OP_READV:
- if (unlikely(s->sqe->buf_index))
+ if (unlikely(req->sqe->buf_index))
return -EINVAL;
ret = io_read(req, nxt, force_nonblock);
break;
case IORING_OP_WRITEV:
- if (unlikely(s->sqe->buf_index))
+ if (unlikely(req->sqe->buf_index))
return -EINVAL;
ret = io_write(req, nxt, force_nonblock);
break;
@@ -2544,34 +2657,37 @@ static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
ret = io_write(req, nxt, force_nonblock);
break;
case IORING_OP_FSYNC:
- ret = io_fsync(req, s->sqe, nxt, force_nonblock);
+ ret = io_fsync(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_POLL_ADD:
- ret = io_poll_add(req, s->sqe, nxt);
+ ret = io_poll_add(req, req->sqe, nxt);
break;
case IORING_OP_POLL_REMOVE:
- ret = io_poll_remove(req, s->sqe);
+ ret = io_poll_remove(req, req->sqe);
break;
case IORING_OP_SYNC_FILE_RANGE:
- ret = io_sync_file_range(req, s->sqe, nxt, force_nonblock);
+ ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_SENDMSG:
- ret = io_sendmsg(req, s->sqe, nxt, force_nonblock);
+ ret = io_sendmsg(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_RECVMSG:
- ret = io_recvmsg(req, s->sqe, nxt, force_nonblock);
+ ret = io_recvmsg(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_TIMEOUT:
- ret = io_timeout(req, s->sqe);
+ ret = io_timeout(req, req->sqe);
break;
case IORING_OP_TIMEOUT_REMOVE:
- ret = io_timeout_remove(req, s->sqe);
+ ret = io_timeout_remove(req, req->sqe);
break;
case IORING_OP_ACCEPT:
- ret = io_accept(req, s->sqe, nxt, force_nonblock);
+ ret = io_accept(req, req->sqe, nxt, force_nonblock);
+ break;
+ case IORING_OP_CONNECT:
+ ret = io_connect(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_ASYNC_CANCEL:
- ret = io_async_cancel(req, s->sqe, nxt);
+ ret = io_async_cancel(req, req->sqe, nxt);
break;
default:
ret = -EINVAL;
@@ -2586,22 +2702,29 @@ static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
return -EAGAIN;
/* workqueue context doesn't hold uring_lock, grab it now */
- if (s->in_async)
+ if (req->in_async)
mutex_lock(&ctx->uring_lock);
io_iopoll_req_issued(req);
- if (s->in_async)
+ if (req->in_async)
mutex_unlock(&ctx->uring_lock);
}
return 0;
}
+static void io_link_work_cb(struct io_wq_work **workptr)
+{
+ struct io_wq_work *work = *workptr;
+ struct io_kiocb *link = work->data;
+
+ io_queue_linked_timeout(link);
+ work->func = io_wq_submit_work;
+}
+
static void io_wq_submit_work(struct io_wq_work **workptr)
{
struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct sqe_submit *s = &req->submit;
- const struct io_uring_sqe *sqe = s->sqe;
struct io_kiocb *nxt = NULL;
int ret = 0;
@@ -2612,10 +2735,10 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
ret = -ECANCELED;
if (!ret) {
- s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
- s->in_async = true;
+ req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
+ req->in_async = true;
do {
- ret = __io_submit_sqe(req, &nxt, false);
+ ret = io_issue_sqe(req, &nxt, false);
/*
* We can get EAGAIN for polled IO even though we're
* forcing a sync submission from here, since we can't
@@ -2637,13 +2760,17 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
io_put_req(req);
}
- /* async context always use a copy of the sqe */
- kfree(sqe);
-
/* if a dependent link is ready, pass it back */
if (!ret && nxt) {
- io_prep_async_work(nxt);
+ struct io_kiocb *link;
+
+ io_prep_async_work(nxt, &link);
*workptr = &nxt->work;
+ if (link) {
+ nxt->work.flags |= IO_WQ_WORK_CB;
+ nxt->work.func = io_link_work_cb;
+ nxt->work.data = link;
+ }
}
}
@@ -2675,24 +2802,17 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
{
- struct sqe_submit *s = &req->submit;
struct io_ring_ctx *ctx = req->ctx;
unsigned flags;
int fd;
- flags = READ_ONCE(s->sqe->flags);
- fd = READ_ONCE(s->sqe->fd);
+ flags = READ_ONCE(req->sqe->flags);
+ fd = READ_ONCE(req->sqe->fd);
if (flags & IOSQE_IO_DRAIN)
req->flags |= REQ_F_IO_DRAIN;
- /*
- * All io need record the previous position, if LINK vs DARIN,
- * it can be used to mark the position of the first IO in the
- * link list.
- */
- req->sequence = s->sequence;
- if (!io_op_needs_file(s->sqe))
+ if (!io_op_needs_file(req->sqe))
return 0;
if (flags & IOSQE_FIXED_FILE) {
@@ -2705,7 +2825,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
return -EBADF;
req->flags |= REQ_F_FIXED_FILE;
} else {
- if (s->needs_fixed_file)
+ if (req->needs_fixed_file)
return -EBADF;
trace_io_uring_file_get(ctx, fd);
req->file = io_file_get(state, fd);
@@ -2729,7 +2849,7 @@ static int io_grab_files(struct io_kiocb *req)
* the fd has changed since we started down this path, and disallow
* this operation if it has.
*/
- if (fcheck(req->submit.ring_fd) == req->submit.ring_file) {
+ if (fcheck(req->ring_fd) == req->ring_file) {
list_add(&req->inflight_entry, &ctx->inflight_list);
req->flags |= REQ_F_INFLIGHT;
req->work.files = current->files;
@@ -2743,8 +2863,9 @@ static int io_grab_files(struct io_kiocb *req)
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
- struct io_kiocb *req = container_of(timer, struct io_kiocb,
- timeout.timer);
+ struct io_timeout_data *data = container_of(timer,
+ struct io_timeout_data, timer);
+ struct io_kiocb *req = data->req;
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *prev = NULL;
unsigned long flags;
@@ -2757,16 +2878,20 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
*/
if (!list_empty(&req->list)) {
prev = list_entry(req->list.prev, struct io_kiocb, link_list);
- if (refcount_inc_not_zero(&prev->refs))
+ if (refcount_inc_not_zero(&prev->refs)) {
list_del_init(&req->list);
- else
+ prev->flags &= ~REQ_F_LINK_TIMEOUT;
+ } else
prev = NULL;
}
spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (prev) {
- io_async_find_and_cancel(ctx, req, prev->user_data, NULL);
+ if (prev->flags & REQ_F_LINK)
+ prev->flags |= REQ_F_FAIL_LINK;
+ io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
+ -ETIME);
io_put_req(prev);
} else {
io_cqring_add_event(req, -ETIME);
@@ -2775,8 +2900,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
- enum hrtimer_mode *mode)
+static void io_queue_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2786,9 +2910,11 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
*/
spin_lock_irq(&ctx->completion_lock);
if (!list_empty(&req->list)) {
- req->timeout.timer.function = io_link_timeout_fn;
- hrtimer_start(&req->timeout.timer, timespec64_to_ktime(*ts),
- *mode);
+ struct io_timeout_data *data = req->timeout.data;
+
+ data->timer.function = io_link_timeout_fn;
+ hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
+ data->mode);
}
spin_unlock_irq(&ctx->completion_lock);
@@ -2796,66 +2922,30 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
io_put_req(req);
}
-static int io_validate_link_timeout(const struct io_uring_sqe *sqe,
- struct timespec64 *ts)
-{
- if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off)
- return -EINVAL;
- if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS)
- return -EINVAL;
- if (get_timespec64(ts, u64_to_user_ptr(sqe->addr)))
- return -EFAULT;
-
- return 0;
-}
-
-static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req,
- struct timespec64 *ts,
- enum hrtimer_mode *mode)
+static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
struct io_kiocb *nxt;
- int ret;
if (!(req->flags & REQ_F_LINK))
return NULL;
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
- if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT)
+ if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT)
return NULL;
- ret = io_validate_link_timeout(nxt->submit.sqe, ts);
- if (ret) {
- list_del_init(&nxt->list);
- io_cqring_add_event(nxt, ret);
- io_double_put_req(nxt);
- return ERR_PTR(-ECANCELED);
- }
-
- if (nxt->submit.sqe->timeout_flags & IORING_TIMEOUT_ABS)
- *mode = HRTIMER_MODE_ABS;
- else
- *mode = HRTIMER_MODE_REL;
-
req->flags |= REQ_F_LINK_TIMEOUT;
- hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, *mode);
return nxt;
}
-static int __io_queue_sqe(struct io_kiocb *req)
+static void __io_queue_sqe(struct io_kiocb *req)
{
- enum hrtimer_mode mode;
- struct io_kiocb *nxt;
- struct timespec64 ts;
+ struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+ struct io_kiocb *nxt = NULL;
int ret;
- nxt = io_prep_linked_timeout(req, &ts, &mode);
- if (IS_ERR(nxt)) {
- ret = PTR_ERR(nxt);
- nxt = NULL;
- goto err;
- }
-
- ret = __io_submit_sqe(req, NULL, true);
+ ret = io_issue_sqe(req, &nxt, true);
+ if (nxt)
+ io_queue_async_work(nxt);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -2863,42 +2953,38 @@ static int __io_queue_sqe(struct io_kiocb *req)
*/
if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
(req->flags & REQ_F_MUST_PUNT))) {
- struct sqe_submit *s = &req->submit;
struct io_uring_sqe *sqe_copy;
- sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
- if (sqe_copy) {
- s->sqe = sqe_copy;
- if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
- ret = io_grab_files(req);
- if (ret) {
- kfree(sqe_copy);
- goto err;
- }
- }
-
- /*
- * Queued up for async execution, worker will release
- * submit reference when the iocb is actually submitted.
- */
- io_queue_async_work(req);
+ sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+ if (!sqe_copy)
+ goto err;
- if (nxt)
- io_queue_linked_timeout(nxt, &ts, &mode);
+ req->sqe = sqe_copy;
+ req->flags |= REQ_F_FREE_SQE;
- return 0;
+ if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
+ ret = io_grab_files(req);
+ if (ret)
+ goto err;
}
+
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually submitted.
+ */
+ io_queue_async_work(req);
+ return;
}
err:
/* drop submission reference */
io_put_req(req);
- if (nxt) {
+ if (linked_timeout) {
if (!ret)
- io_queue_linked_timeout(nxt, &ts, &mode);
+ io_queue_linked_timeout(linked_timeout);
else
- io_put_req(nxt);
+ io_put_req(linked_timeout);
}
/* and drop final reference, if we failed */
@@ -2908,83 +2994,52 @@ err:
req->flags |= REQ_F_FAIL_LINK;
io_put_req(req);
}
-
- return ret;
}
-static int io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe(struct io_kiocb *req)
{
int ret;
- ret = io_req_defer(req);
- if (ret) {
- if (ret != -EIOCBQUEUED) {
- io_cqring_add_event(req, ret);
- io_double_put_req(req);
- }
- return 0;
+ if (unlikely(req->ctx->drain_next)) {
+ req->flags |= REQ_F_IO_DRAIN;
+ req->ctx->drain_next = false;
}
+ req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
- return __io_queue_sqe(req);
-}
-
-static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
-{
- int ret;
- int need_submit = false;
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!shadow)
- return io_queue_sqe(req);
-
- /*
- * Mark the first IO in link list as DRAIN, let all the following
- * IOs enter the defer list. all IO needs to be completed before link
- * list.
- */
- req->flags |= REQ_F_IO_DRAIN;
ret = io_req_defer(req);
if (ret) {
if (ret != -EIOCBQUEUED) {
io_cqring_add_event(req, ret);
+ if (req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
io_double_put_req(req);
- __io_free_req(shadow);
- return 0;
}
- } else {
- /*
- * If ret == 0 means that all IOs in front of link io are
- * running done. let's queue link head.
- */
- need_submit = true;
- }
-
- /* Insert shadow req to defer_list, blocking next IOs */
- spin_lock_irq(&ctx->completion_lock);
- trace_io_uring_defer(ctx, shadow, true);
- list_add_tail(&shadow->list, &ctx->defer_list);
- spin_unlock_irq(&ctx->completion_lock);
-
- if (need_submit)
- return __io_queue_sqe(req);
+ } else
+ __io_queue_sqe(req);
+}
- return 0;
+static inline void io_queue_link_head(struct io_kiocb *req)
+{
+ if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
+ io_cqring_add_event(req, -ECANCELED);
+ io_double_put_req(req);
+ } else
+ io_queue_sqe(req);
}
+
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
struct io_kiocb **link)
{
- struct io_uring_sqe *sqe_copy;
- struct sqe_submit *s = &req->submit;
struct io_ring_ctx *ctx = req->ctx;
int ret;
- req->user_data = s->sqe->user_data;
+ req->user_data = req->sqe->user_data;
/* enforce forwards compatibility on users */
- if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
+ if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) {
ret = -EINVAL;
goto err_req;
}
@@ -3006,25 +3061,37 @@ err_req:
*/
if (*link) {
struct io_kiocb *prev = *link;
+ struct io_uring_sqe *sqe_copy;
- sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+ if (req->sqe->flags & IOSQE_IO_DRAIN)
+ (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
+
+ if (READ_ONCE(req->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
+ ret = io_timeout_setup(req);
+ /* common setup allows offset being set, we don't */
+ if (!ret && req->sqe->off)
+ ret = -EINVAL;
+ if (ret) {
+ prev->flags |= REQ_F_FAIL_LINK;
+ goto err_req;
+ }
+ }
+
+ sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
if (!sqe_copy) {
ret = -EAGAIN;
goto err_req;
}
- s->sqe = sqe_copy;
+ req->sqe = sqe_copy;
+ req->flags |= REQ_F_FREE_SQE;
trace_io_uring_link(ctx, req, prev);
list_add_tail(&req->list, &prev->link_list);
- } else if (s->sqe->flags & IOSQE_IO_LINK) {
+ } else if (req->sqe->flags & IOSQE_IO_LINK) {
req->flags |= REQ_F_LINK;
INIT_LIST_HEAD(&req->link_list);
*link = req;
- } else if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
- /* Only valid as a linked SQE */
- ret = -EINVAL;
- goto err_req;
} else {
io_queue_sqe(req);
}
@@ -3076,7 +3143,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
* used, it's important that those reads are done through READ_ONCE() to
* prevent a re-load down the line.
*/
-static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
+static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req)
{
struct io_rings *rings = ctx->rings;
u32 *sq_array = ctx->sq_array;
@@ -3092,14 +3159,18 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
*/
head = ctx->cached_sq_head;
/* make sure SQ entry isn't read before tail */
- if (head == smp_load_acquire(&rings->sq.tail))
+ if (unlikely(head == smp_load_acquire(&rings->sq.tail)))
return false;
head = READ_ONCE(sq_array[head & ctx->sq_mask]);
- if (head < ctx->sq_entries) {
- s->ring_file = NULL;
- s->sqe = &ctx->sq_sqes[head];
- s->sequence = ctx->cached_sq_head;
+ if (likely(head < ctx->sq_entries)) {
+ /*
+ * All io need record the previous position, if LINK vs DARIN,
+ * it can be used to mark the position of the first IO in the
+ * link list.
+ */
+ req->sequence = ctx->cached_sq_head;
+ req->sqe = &ctx->sq_sqes[head];
ctx->cached_sq_head++;
return true;
}
@@ -3117,14 +3188,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
- struct io_kiocb *shadow_req = NULL;
int i, submitted = 0;
bool mm_fault = false;
- if (!list_empty(&ctx->cq_overflow_list)) {
- io_cqring_overflow_flush(ctx, false);
+ /* if we have a backlog and couldn't flush it all, return BUSY */
+ if (!list_empty(&ctx->cq_overflow_list) &&
+ !io_cqring_overflow_flush(ctx, false))
return -EBUSY;
- }
if (nr > IO_PLUG_THRESHOLD) {
io_submit_state_start(&state, ctx, nr);
@@ -3141,12 +3211,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
submitted = -EAGAIN;
break;
}
- if (!io_get_sqring(ctx, &req->submit)) {
+ if (!io_get_sqring(ctx, req)) {
__io_free_req(req);
break;
}
- if (io_sqe_needs_user(req->submit.sqe) && !*mm) {
+ if (io_sqe_needs_user(req->sqe) && !*mm) {
mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
if (!mm_fault) {
use_mm(ctx->sqo_mm);
@@ -3154,26 +3224,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
}
}
- sqe_flags = req->submit.sqe->flags;
+ sqe_flags = req->sqe->flags;
- if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
- if (!shadow_req) {
- shadow_req = io_get_req(ctx, NULL);
- if (unlikely(!shadow_req))
- goto out;
- shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
- refcount_dec(&shadow_req->refs);
- }
- shadow_req->sequence = req->submit.sequence;
- }
-
-out:
- req->submit.ring_file = ring_file;
- req->submit.ring_fd = ring_fd;
- req->submit.has_user = *mm != NULL;
- req->submit.in_async = async;
- req->submit.needs_fixed_file = async;
- trace_io_uring_submit_sqe(ctx, req->submit.sqe->user_data,
+ req->ring_file = ring_file;
+ req->ring_fd = ring_fd;
+ req->has_user = *mm != NULL;
+ req->in_async = async;
+ req->needs_fixed_file = async;
+ trace_io_uring_submit_sqe(ctx, req->sqe->user_data,
true, async);
io_submit_sqe(req, statep, &link);
submitted++;
@@ -3183,14 +3241,13 @@ out:
* that's the end of the chain. Submit the previous link.
*/
if (!(sqe_flags & IOSQE_IO_LINK) && link) {
- io_queue_link_head(link, shadow_req);
+ io_queue_link_head(link);
link = NULL;
- shadow_req = NULL;
}
}
if (link)
- io_queue_link_head(link, shadow_req);
+ io_queue_link_head(link);
if (statep)
io_submit_state_end(&state);
@@ -3204,6 +3261,7 @@ static int io_sq_thread(void *data)
{
struct io_ring_ctx *ctx = data;
struct mm_struct *cur_mm = NULL;
+ const struct cred *old_cred;
mm_segment_t old_fs;
DEFINE_WAIT(wait);
unsigned inflight;
@@ -3214,6 +3272,7 @@ static int io_sq_thread(void *data)
old_fs = get_fs();
set_fs(USER_DS);
+ old_cred = override_creds(ctx->creds);
ret = timeout = inflight = 0;
while (!kthread_should_park()) {
@@ -3320,6 +3379,7 @@ static int io_sq_thread(void *data)
unuse_mm(cur_mm);
mmput(cur_mm);
}
+ revert_creds(old_cred);
kthread_parkme();
@@ -3899,6 +3959,7 @@ static void io_get_work(struct io_wq_work *work)
static int io_sq_offload_start(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
+ struct io_wq_data data;
unsigned concurrency;
int ret;
@@ -3943,10 +4004,15 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
goto err;
}
+ data.mm = ctx->sqo_mm;
+ data.user = ctx->user;
+ data.creds = ctx->creds;
+ data.get_work = io_get_work;
+ data.put_work = io_put_work;
+
/* Do QD, or 4 * CPUS, whatever is smallest */
concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
- ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, ctx->user,
- io_get_work, io_put_work);
+ ctx->io_wq = io_wq_create(concurrency, &data);
if (IS_ERR(ctx->io_wq)) {
ret = PTR_ERR(ctx->io_wq);
ctx->io_wq = NULL;
@@ -4295,6 +4361,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_unaccount_mem(ctx->user,
ring_pages(ctx->sq_entries, ctx->cq_entries));
free_uid(ctx->user);
+ put_cred(ctx->creds);
kfree(ctx->completions);
kmem_cache_free(req_cachep, ctx->fallback_req);
kfree(ctx);
@@ -4577,12 +4644,18 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
ctx->cq_entries = rings->cq_ring_entries;
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
- if (size == SIZE_MAX)
+ if (size == SIZE_MAX) {
+ io_mem_free(ctx->rings);
+ ctx->rings = NULL;
return -EOVERFLOW;
+ }
ctx->sq_sqes = io_mem_alloc(size);
- if (!ctx->sq_sqes)
+ if (!ctx->sq_sqes) {
+ io_mem_free(ctx->rings);
+ ctx->rings = NULL;
return -ENOMEM;
+ }
return 0;
}
@@ -4686,6 +4759,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
ctx->compat = in_compat_syscall();
ctx->account_mem = account_mem;
ctx->user = user;
+ ctx->creds = prepare_creds();
ret = io_allocate_scq_urings(ctx, p);
if (ret)