summaryrefslogtreecommitdiffstats
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c1699
1 files changed, 898 insertions, 801 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5190bfb6a665..a78201b96179 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -55,7 +55,6 @@
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
-#include <linux/mmu_context.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/kthread.h>
@@ -142,7 +141,7 @@ struct io_rings {
*/
u32 sq_dropped;
/*
- * Runtime flags
+ * Runtime SQ flags
*
* Written by the kernel, shouldn't be modified by the
* application.
@@ -152,6 +151,13 @@ struct io_rings {
*/
u32 sq_flags;
/*
+ * Runtime CQ flags
+ *
+ * Written by the application, shouldn't be modified by the
+ * kernel.
+ */
+ u32 cq_flags;
+ /*
* Number of completion events lost because the queue was full;
* this should be avoided by the application by making sure
* there are not more requests pending than there is space in
@@ -191,7 +197,7 @@ struct fixed_file_ref_node {
struct list_head node;
struct list_head file_list;
struct fixed_file_data *file_data;
- struct work_struct work;
+ struct llist_node llist;
};
struct fixed_file_data {
@@ -279,8 +285,8 @@ struct io_ring_ctx {
const struct cred *creds;
- /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
- struct completion *completions;
+ struct completion ref_comp;
+ struct completion sq_thread_comp;
/* if all else fails... */
struct io_kiocb *fallback_req;
@@ -327,6 +333,9 @@ struct io_ring_ctx {
struct list_head inflight_list;
} ____cacheline_aligned_in_smp;
+ struct delayed_work file_put_work;
+ struct llist_head file_put_llist;
+
struct work_struct exit_work;
};
@@ -357,7 +366,6 @@ struct io_timeout_data {
struct hrtimer timer;
struct timespec64 ts;
enum hrtimer_mode mode;
- u32 seq_offset;
};
struct io_accept {
@@ -385,7 +393,8 @@ struct io_timeout {
struct file *file;
u64 addr;
int flags;
- unsigned count;
+ u32 off;
+ u32 target_seq;
};
struct io_rw {
@@ -416,11 +425,7 @@ struct io_sr_msg {
struct io_open {
struct file *file;
int dfd;
- union {
- unsigned mask;
- };
struct filename *filename;
- struct statx __user *buffer;
struct open_how how;
unsigned long nofile;
};
@@ -472,6 +477,15 @@ struct io_provide_buf {
__u16 bid;
};
+struct io_statx {
+ struct file *file;
+ int dfd;
+ unsigned int mask;
+ unsigned int flags;
+ const char __user *filename;
+ struct statx __user *buffer;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -508,12 +522,12 @@ enum {
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
+ REQ_F_LINK_HEAD_BIT,
REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
- REQ_F_IOPOLL_COMPLETED_BIT,
REQ_F_LINK_TIMEOUT_BIT,
REQ_F_TIMEOUT_BIT,
REQ_F_ISREG_BIT,
@@ -524,6 +538,10 @@ enum {
REQ_F_OVERFLOW_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
+ REQ_F_NO_FILE_TABLE_BIT,
+ REQ_F_QUEUE_TIMEOUT_BIT,
+ REQ_F_WORK_INITIALIZED_BIT,
+ REQ_F_TASK_PINNED_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -543,6 +561,8 @@ enum {
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
+ /* head of a link */
+ REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
/* already grabbed next link */
REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
/* fail rest of links */
@@ -553,8 +573,6 @@ enum {
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
/* must not punt to workers */
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
- /* polled IO has completed */
- REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
/* has linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
/* timeout request */
@@ -575,6 +593,14 @@ enum {
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
+ /* doesn't need file table for this request */
+ REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
+ /* needs to queue linked timeout */
+ REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
+ /* io_wq_work is initialized */
+ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
+ /* req->task is refcounted */
+ REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT),
};
struct async_poll {
@@ -607,12 +633,16 @@ struct io_kiocb {
struct io_epoll epoll;
struct io_splice splice;
struct io_provide_buf pbuf;
+ struct io_statx statx;
};
struct io_async_ctx *io;
int cflags;
- bool needs_fixed_file;
u8 opcode;
+ /* polled IO has completed */
+ u8 iopoll_completed;
+
+ u16 buf_index;
struct io_ring_ctx *ctx;
struct list_head list;
@@ -675,8 +705,8 @@ struct io_op_def {
unsigned needs_mm : 1;
/* needs req->file assigned */
unsigned needs_file : 1;
- /* needs req->file assigned IFF fd is >= 0 */
- unsigned fd_non_neg : 1;
+ /* don't fail if file grab fails */
+ unsigned needs_file_no_error : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
@@ -779,13 +809,12 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
},
[IORING_OP_OPENAT] = {
- .needs_file = 1,
- .fd_non_neg = 1,
.file_table = 1,
.needs_fs = 1,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
+ .needs_file_no_error = 1,
.file_table = 1,
},
[IORING_OP_FILES_UPDATE] = {
@@ -794,9 +823,8 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_STATX] = {
.needs_mm = 1,
- .needs_file = 1,
- .fd_non_neg = 1,
.needs_fs = 1,
+ .file_table = 1,
},
[IORING_OP_READ] = {
.needs_mm = 1,
@@ -831,8 +859,6 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
},
[IORING_OP_OPENAT2] = {
- .needs_file = 1,
- .fd_non_neg = 1,
.file_table = 1,
.needs_fs = 1,
},
@@ -847,6 +873,11 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_PROVIDE_BUFFERS] = {},
[IORING_OP_REMOVE_BUFFERS] = {},
+ [IORING_OP_TEE] = {
+ .needs_file = 1,
+ .hash_reg_file = 1,
+ .unbound_nonreg_file = 1,
+ },
};
static void io_wq_submit_work(struct io_wq_work **workptr);
@@ -882,11 +913,46 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+static void io_get_req_task(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_TASK_PINNED)
+ return;
+ get_task_struct(req->task);
+ req->flags |= REQ_F_TASK_PINNED;
+}
+
+/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
+static void __io_put_req_task(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_TASK_PINNED)
+ put_task_struct(req->task);
+}
+
+static void io_file_put_work(struct work_struct *work);
+
+/*
+ * Note: must call io_req_init_async() for the first time you
+ * touch any members of io_wq_work.
+ */
+static inline void io_req_init_async(struct io_kiocb *req)
+{
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ return;
+
+ memset(&req->work, 0, sizeof(req->work));
+ req->flags |= REQ_F_WORK_INITIALIZED;
+}
+
+static inline bool io_async_submit(struct io_ring_ctx *ctx)
+{
+ return ctx->flags & IORING_SETUP_SQPOLL;
+}
+
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
- complete(&ctx->completions[0]);
+ complete(&ctx->ref_comp);
}
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
@@ -902,10 +968,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
if (!ctx->fallback_req)
goto err;
- ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
- if (!ctx->completions)
- goto err;
-
/*
* Use 5 bits less than the max cq entries, that should give us around
* 32 entries per hash list if totally full and uniformly spread.
@@ -926,10 +988,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
goto err;
ctx->flags = p->flags;
+ init_waitqueue_head(&ctx->sqo_wait);
init_waitqueue_head(&ctx->cq_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
- init_completion(&ctx->completions[0]);
- init_completion(&ctx->completions[1]);
+ init_completion(&ctx->ref_comp);
+ init_completion(&ctx->sq_thread_comp);
idr_init(&ctx->io_buffer_idr);
idr_init(&ctx->personality_idr);
mutex_init(&ctx->uring_lock);
@@ -941,11 +1004,12 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->inflight_wait);
spin_lock_init(&ctx->inflight_lock);
INIT_LIST_HEAD(&ctx->inflight_list);
+ INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
+ init_llist_head(&ctx->file_put_llist);
return ctx;
err:
if (ctx->fallback_req)
kmem_cache_free(req_cachep, ctx->fallback_req);
- kfree(ctx->completions);
kfree(ctx->cancel_hash);
kfree(ctx);
return NULL;
@@ -955,8 +1019,8 @@ static inline bool __req_need_defer(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
- + atomic_read(&ctx->cached_cq_overflow);
+ return req->sequence != ctx->cached_cq_tail
+ + atomic_read(&ctx->cached_cq_overflow);
}
static inline bool req_need_defer(struct io_kiocb *req)
@@ -967,36 +1031,6 @@ static inline bool req_need_defer(struct io_kiocb *req)
return false;
}
-static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
-{
- struct io_kiocb *req;
-
- req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
- if (req && !req_need_defer(req)) {
- list_del_init(&req->list);
- return req;
- }
-
- return NULL;
-}
-
-static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
-{
- struct io_kiocb *req;
-
- req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
- if (req) {
- if (req->flags & REQ_F_TIMEOUT_NOSEQ)
- return NULL;
- if (!__req_need_defer(req)) {
- list_del_init(&req->list);
- return req;
- }
- }
-
- return NULL;
-}
-
static void __io_commit_cqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
@@ -1029,12 +1063,13 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
}
spin_unlock(&current->fs->lock);
}
- if (!req->work.task_pid)
- req->work.task_pid = task_pid_vnr(current);
}
static inline void io_req_work_drop_env(struct io_kiocb *req)
{
+ if (!(req->flags & REQ_F_WORK_INITIALIZED))
+ return;
+
if (req->work.mm) {
mmdrop(req->work.mm);
req->work.mm = NULL;
@@ -1068,6 +1103,7 @@ static inline void io_prep_async_work(struct io_kiocb *req,
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
+ io_req_init_async(req);
io_req_work_grab_env(req, def);
*link = io_prep_linked_timeout(req);
@@ -1112,17 +1148,43 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->completion_lock);
}
-static void io_commit_cqring(struct io_ring_ctx *ctx)
+static void __io_queue_deferred(struct io_ring_ctx *ctx)
{
- struct io_kiocb *req;
+ do {
+ struct io_kiocb *req = list_first_entry(&ctx->defer_list,
+ struct io_kiocb, list);
+
+ if (req_need_defer(req))
+ break;
+ list_del_init(&req->list);
+ io_queue_async_work(req);
+ } while (!list_empty(&ctx->defer_list));
+}
- while ((req = io_get_timeout_req(ctx)) != NULL)
+static void io_flush_timeouts(struct io_ring_ctx *ctx)
+{
+ while (!list_empty(&ctx->timeout_list)) {
+ struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
+ struct io_kiocb, list);
+
+ if (req->flags & REQ_F_TIMEOUT_NOSEQ)
+ break;
+ if (req->timeout.target_seq != ctx->cached_cq_tail
+ - atomic_read(&ctx->cq_timeouts))
+ break;
+
+ list_del_init(&req->list);
io_kill_timeout(req);
+ }
+}
+static void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+ io_flush_timeouts(ctx);
__io_commit_cqring(ctx);
- while ((req = io_get_deferred_req(ctx)) != NULL)
- io_queue_async_work(req);
+ if (unlikely(!list_empty(&ctx->defer_list)))
+ __io_queue_deferred(ctx);
}
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
@@ -1147,6 +1209,8 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
if (!ctx->cq_ev_fd)
return false;
+ if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
+ return false;
if (!ctx->eventfd_async)
return true;
return io_wq_current_is_worker();
@@ -1289,7 +1353,7 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
struct io_kiocb *req;
req = ctx->fallback_req;
- if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
+ if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
return req;
return NULL;
@@ -1351,9 +1415,7 @@ static void __io_req_aux_free(struct io_kiocb *req)
kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- if (req->task)
- put_task_struct(req->task);
-
+ __io_put_req_task(req);
io_req_work_drop_env(req);
}
@@ -1376,7 +1438,7 @@ static void __io_free_req(struct io_kiocb *req)
if (likely(!io_is_fallback_req(req)))
kmem_cache_free(req_cachep, req);
else
- clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
+ clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
}
struct req_batch {
@@ -1396,10 +1458,6 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
for (i = 0; i < rb->to_free; i++) {
struct io_kiocb *req = rb->reqs[i];
- if (req->flags & REQ_F_FIXED_FILE) {
- req->file = NULL;
- percpu_ref_put(req->fixed_file_refs);
- }
if (req->flags & REQ_F_INFLIGHT)
inflight++;
__io_req_aux_free(req);
@@ -1437,7 +1495,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
if (ret != -1) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx);
- req->flags &= ~REQ_F_LINK;
+ req->flags &= ~REQ_F_LINK_HEAD;
io_put_req(req);
return true;
}
@@ -1473,7 +1531,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
list_del_init(&req->link_list);
if (!list_empty(&nxt->link_list))
- nxt->flags |= REQ_F_LINK;
+ nxt->flags |= REQ_F_LINK_HEAD;
*nxtptr = nxt;
break;
}
@@ -1484,7 +1542,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
}
/*
- * Called if REQ_F_LINK is set, and we fail the head request
+ * Called if REQ_F_LINK_HEAD is set, and we fail the head request
*/
static void io_fail_links(struct io_kiocb *req)
{
@@ -1517,7 +1575,7 @@ static void io_fail_links(struct io_kiocb *req)
static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
{
- if (likely(!(req->flags & REQ_F_LINK)))
+ if (likely(!(req->flags & REQ_F_LINK_HEAD)))
return;
/*
@@ -1557,16 +1615,6 @@ static void io_free_req(struct io_kiocb *req)
io_queue_async_work(nxt);
}
-static void io_link_work_cb(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *link;
-
- link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- io_queue_linked_timeout(link);
- io_wq_submit_work(workptr);
-}
-
static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
{
struct io_kiocb *link;
@@ -1578,7 +1626,7 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
*workptr = &nxt->work;
link = io_prep_linked_timeout(nxt);
if (link)
- nxt->work.func = io_link_work_cb;
+ nxt->flags |= REQ_F_QUEUE_TIMEOUT;
}
/*
@@ -1669,10 +1717,10 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
{
- if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
+ if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
return false;
- if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
+ if (req->file || req->io)
rb->need_iter++;
rb->reqs[rb->to_free++] = req;
@@ -1694,6 +1742,18 @@ static int io_put_kbuf(struct io_kiocb *req)
return cflags;
}
+static void io_iopoll_queue(struct list_head *again)
+{
+ struct io_kiocb *req;
+
+ do {
+ req = list_first_entry(again, struct io_kiocb, list);
+ list_del(&req->list);
+ refcount_inc(&req->refs);
+ io_queue_async_work(req);
+ } while (!list_empty(again));
+}
+
/*
* Find and free completed poll iocbs
*/
@@ -1702,12 +1762,21 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
{
struct req_batch rb;
struct io_kiocb *req;
+ LIST_HEAD(again);
+
+ /* order with ->result store in io_complete_rw_iopoll() */
+ smp_rmb();
rb.to_free = rb.need_iter = 0;
while (!list_empty(done)) {
int cflags = 0;
req = list_first_entry(done, struct io_kiocb, list);
+ if (READ_ONCE(req->result) == -EAGAIN) {
+ req->iopoll_completed = 0;
+ list_move_tail(&req->list, &again);
+ continue;
+ }
list_del(&req->list);
if (req->flags & REQ_F_BUFFER_SELECTED)
@@ -1725,18 +1794,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (ctx->flags & IORING_SETUP_SQPOLL)
io_cqring_ev_posted(ctx);
io_free_req_many(ctx, &rb);
-}
-static void io_iopoll_queue(struct list_head *again)
-{
- struct io_kiocb *req;
-
- do {
- req = list_first_entry(again, struct io_kiocb, list);
- list_del(&req->list);
- refcount_inc(&req->refs);
- io_queue_async_work(req);
- } while (!list_empty(again));
+ if (!list_empty(&again))
+ io_iopoll_queue(&again);
}
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
@@ -1744,7 +1804,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
{
struct io_kiocb *req, *tmp;
LIST_HEAD(done);
- LIST_HEAD(again);
bool spin;
int ret;
@@ -1763,20 +1822,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
* If we find a request that requires polling, break out
* and complete those lists first, if we have entries there.
*/
- if (req->flags & REQ_F_IOPOLL_COMPLETED) {
+ if (READ_ONCE(req->iopoll_completed)) {
list_move_tail(&req->list, &done);
continue;
}
if (!list_empty(&done))
break;
- if (req->result == -EAGAIN) {
- list_move_tail(&req->list, &again);
- continue;
- }
- if (!list_empty(&again))
- break;
-
ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
if (ret < 0)
break;
@@ -1789,9 +1841,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
io_iopoll_complete(ctx, nr_events, &done);
- if (!list_empty(&again))
- io_iopoll_queue(&again);
-
return ret;
}
@@ -1940,11 +1989,15 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if (res != req->result)
+ if (res != -EAGAIN && res != req->result)
req_set_fail_links(req);
- req->result = res;
- if (res != -EAGAIN)
- req->flags |= REQ_F_IOPOLL_COMPLETED;
+
+ WRITE_ONCE(req->result, res);
+ /* order with io_poll_complete() checking ->result */
+ if (res != -EAGAIN) {
+ smp_wmb();
+ WRITE_ONCE(req->iopoll_completed, 1);
+ }
}
/*
@@ -1977,7 +2030,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
* For fast devices, IO may have already completed. If it has, add
* it to the front so we find it first.
*/
- if (req->flags & REQ_F_IOPOLL_COMPLETED)
+ if (READ_ONCE(req->iopoll_completed))
list_add(&req->list, &ctx->poll_list);
else
list_add_tail(&req->list, &ctx->poll_list);
@@ -1987,15 +2040,19 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
wake_up(&ctx->sqo_wait);
}
-static void io_file_put(struct io_submit_state *state)
+static void __io_state_file_put(struct io_submit_state *state)
{
- if (state->file) {
- int diff = state->has_refs - state->used_refs;
+ int diff = state->has_refs - state->used_refs;
- if (diff)
- fput_many(state->file, diff);
- state->file = NULL;
- }
+ if (diff)
+ fput_many(state->file, diff);
+ state->file = NULL;
+}
+
+static inline void io_state_file_put(struct io_submit_state *state)
+{
+ if (state->file)
+ __io_state_file_put(state);
}
/*
@@ -2014,7 +2071,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
state->ios_left--;
return state->file;
}
- io_file_put(state);
+ __io_state_file_put(state);
}
state->file = fget_many(fd, state->ios_left);
if (!state->file)
@@ -2032,7 +2089,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
* any file. For now, just ensure that anything potentially problematic is done
* inline.
*/
-static bool io_file_supports_async(struct file *file)
+static bool io_file_supports_async(struct file *file, int rw)
{
umode_t mode = file_inode(file)->i_mode;
@@ -2041,7 +2098,17 @@ static bool io_file_supports_async(struct file *file)
if (S_ISREG(mode) && file->f_op != &io_uring_fops)
return true;
- return false;
+ /* any ->read/write should understand O_NONBLOCK */
+ if (file->f_flags & O_NONBLOCK)
+ return true;
+
+ if (!(file->f_mode & FMODE_NOWAIT))
+ return false;
+
+ if (rw == READ)
+ return file->f_op->read_iter != NULL;
+
+ return file->f_op->write_iter != NULL;
}
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
@@ -2077,8 +2144,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
kiocb->ki_ioprio = get_current_ioprio();
/* don't allow async punt if RWF_NOWAIT was requested */
- if ((kiocb->ki_flags & IOCB_NOWAIT) ||
- (req->file->f_flags & O_NONBLOCK))
+ if (kiocb->ki_flags & IOCB_NOWAIT)
req->flags |= REQ_F_NOWAIT;
if (force_nonblock)
@@ -2092,6 +2158,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
req->result = 0;
+ req->iopoll_completed = 0;
} else {
if (kiocb->ki_flags & IOCB_HIPRI)
return -EINVAL;
@@ -2100,9 +2167,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
- /* we own ->private, reuse it for the buffer index / buffer ID */
- req->rw.kiocb.private = (void *) (unsigned long)
- READ_ONCE(sqe->buf_index);
+ req->buf_index = READ_ONCE(sqe->buf_index);
return 0;
}
@@ -2145,7 +2210,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
struct io_ring_ctx *ctx = req->ctx;
size_t len = req->rw.len;
struct io_mapped_ubuf *imu;
- unsigned index, buf_index;
+ u16 index, buf_index;
size_t offset;
u64 buf_addr;
@@ -2153,7 +2218,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
if (unlikely(!ctx->user_bufs))
return -EFAULT;
- buf_index = (unsigned long) req->rw.kiocb.private;
+ buf_index = req->buf_index;
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
@@ -2269,10 +2334,10 @@ static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
bool needs_lock)
{
struct io_buffer *kbuf;
- int bgid;
+ u16 bgid;
kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
- bgid = (int) (unsigned long) req->rw.kiocb.private;
+ bgid = req->buf_index;
kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
if (IS_ERR(kbuf))
return kbuf;
@@ -2332,8 +2397,14 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
bool needs_lock)
{
- if (req->flags & REQ_F_BUFFER_SELECTED)
+ if (req->flags & REQ_F_BUFFER_SELECTED) {
+ struct io_buffer *kbuf;
+
+ kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
+ iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+ iov[0].iov_len = kbuf->len;
return 0;
+ }
if (!req->rw.len)
return 0;
else if (req->rw.len > 1)
@@ -2363,7 +2434,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
}
/* buffer index only valid with fixed read/write, or buffer select */
- if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT))
+ if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
@@ -2562,14 +2633,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
req->result = 0;
io_size = ret;
- if (req->flags & REQ_F_LINK)
+ if (req->flags & REQ_F_LINK_HEAD)
req->result = io_size;
/*
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
- if (force_nonblock && !io_file_supports_async(req->file))
+ if (force_nonblock && !io_file_supports_async(req->file, READ))
goto copy_iov;
iov_count = iov_iter_count(&iter);
@@ -2592,14 +2663,15 @@ copy_iov:
if (ret)
goto out_free;
/* any defer here is final, must blocking retry */
- if (!(req->flags & REQ_F_NOWAIT))
+ if (!(req->flags & REQ_F_NOWAIT) &&
+ !file_can_poll(req->file))
req->flags |= REQ_F_MUST_PUNT;
return -EAGAIN;
}
}
out_free:
- kfree(iovec);
- req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (!(req->flags & REQ_F_NEED_CLEANUP))
+ kfree(iovec);
return ret;
}
@@ -2653,14 +2725,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
req->result = 0;
io_size = ret;
- if (req->flags & REQ_F_LINK)
+ if (req->flags & REQ_F_LINK_HEAD)
req->result = io_size;
/*
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
- if (force_nonblock && !io_file_supports_async(req->file))
+ if (force_nonblock && !io_file_supports_async(req->file, WRITE))
goto copy_iov;
/* file path doesn't support NOWAIT for non-direct_IO */
@@ -2714,17 +2786,20 @@ copy_iov:
if (ret)
goto out_free;
/* any defer here is final, must blocking retry */
- req->flags |= REQ_F_MUST_PUNT;
+ if (!(req->flags & REQ_F_NOWAIT) &&
+ !file_can_poll(req->file))
+ req->flags |= REQ_F_MUST_PUNT;
return -EAGAIN;
}
}
out_free:
- req->flags &= ~REQ_F_NEED_CLEANUP;
- kfree(iovec);
+ if (!(req->flags & REQ_F_NEED_CLEANUP))
+ kfree(iovec);
return ret;
}
-static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int __io_splice_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
{
struct io_splice* sp = &req->splice;
unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
@@ -2732,10 +2807,10 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
sp->file_in = NULL;
- sp->off_in = READ_ONCE(sqe->splice_off_in);
- sp->off_out = READ_ONCE(sqe->off);
sp->len = READ_ONCE(sqe->len);
sp->flags = READ_ONCE(sqe->splice_flags);
@@ -2748,19 +2823,56 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return ret;
req->flags |= REQ_F_NEED_CLEANUP;
- if (!S_ISREG(file_inode(sp->file_in)->i_mode))
+ if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
+ /*
+ * Splice operation will be punted aync, and here need to
+ * modify io_wq_work.flags, so initialize io_wq_work firstly.
+ */
+ io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_UNBOUND;
+ }
return 0;
}
-static bool io_splice_punt(struct file *file)
+static int io_tee_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
{
- if (get_pipe_info(file))
- return false;
- if (!io_file_supports_async(file))
- return true;
- return !(file->f_mode & O_NONBLOCK);
+ if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
+ return -EINVAL;
+ return __io_splice_prep(req, sqe);
+}
+
+static int io_tee(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_splice *sp = &req->splice;
+ struct file *in = sp->file_in;
+ struct file *out = sp->file_out;
+ unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
+ long ret = 0;
+
+ if (force_nonblock)
+ return -EAGAIN;
+ if (sp->len)
+ ret = do_tee(in, out, sp->len, flags);
+
+ io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+
+ io_cqring_add_event(req, ret);
+ if (ret != sp->len)
+ req_set_fail_links(req);
+ io_put_req(req);
+ return 0;
+}
+
+static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_splice* sp = &req->splice;
+
+ sp->off_in = READ_ONCE(sqe->splice_off_in);
+ sp->off_out = READ_ONCE(sqe->off);
+ return __io_splice_prep(req, sqe);
}
static int io_splice(struct io_kiocb *req, bool force_nonblock)
@@ -2770,19 +2882,16 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
struct file *out = sp->file_out;
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
loff_t *poff_in, *poff_out;
- long ret;
+ long ret = 0;
- if (force_nonblock) {
- if (io_splice_punt(in) || io_splice_punt(out))
- return -EAGAIN;
- flags |= SPLICE_F_NONBLOCK;
- }
+ if (force_nonblock)
+ return -EAGAIN;
poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
- ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
- if (force_nonblock && ret == -EAGAIN)
- return -EAGAIN;
+
+ if (sp->len)
+ ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
req->flags &= ~REQ_F_NEED_CLEANUP;
@@ -2830,23 +2939,15 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static bool io_req_cancelled(struct io_kiocb *req)
-{
- if (req->work.flags & IO_WQ_WORK_CANCEL) {
- req_set_fail_links(req);
- io_cqring_add_event(req, -ECANCELED);
- io_put_req(req);
- return true;
- }
-
- return false;
-}
-
-static void __io_fsync(struct io_kiocb *req)
+static int io_fsync(struct io_kiocb *req, bool force_nonblock)
{
loff_t end = req->sync.off + req->sync.len;
int ret;
+ /* fsync always requires a blocking context */
+ if (force_nonblock)
+ return -EAGAIN;
+
ret = vfs_fsync_range(req->file, req->sync.off,
end > 0 ? end : LLONG_MAX,
req->sync.flags & IORING_FSYNC_DATASYNC);
@@ -2854,58 +2955,16 @@ static void __io_fsync(struct io_kiocb *req)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
-}
-
-static void io_fsync_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_fsync(req);
- io_steal_work(req, workptr);
-}
-
-static int io_fsync(struct io_kiocb *req, bool force_nonblock)
-{
- /* fsync always requires a blocking context */
- if (force_nonblock) {
- req->work.func = io_fsync_finish;
- return -EAGAIN;
- }
- __io_fsync(req);
return 0;
}
-static void __io_fallocate(struct io_kiocb *req)
-{
- int ret;
-
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
- ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
- req->sync.len);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- if (ret < 0)
- req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
-}
-
-static void io_fallocate_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_fallocate(req);
- io_steal_work(req, workptr);
-}
-
static int io_fallocate_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
return -EINVAL;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->addr);
@@ -2916,66 +2975,74 @@ static int io_fallocate_prep(struct io_kiocb *req,
static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
{
+ int ret;
+
/* fallocate always requiring blocking context */
- if (force_nonblock) {
- req->work.func = io_fallocate_finish;
+ if (force_nonblock)
return -EAGAIN;
- }
- __io_fallocate(req);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
+ ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
+ req->sync.len);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req(req);
return 0;
}
-static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
const char __user *fname;
int ret;
- if (sqe->ioprio || sqe->buf_index)
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
return -EINVAL;
- if (req->flags & REQ_F_FIXED_FILE)
+ if (unlikely(sqe->ioprio || sqe->buf_index))
+ return -EINVAL;
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
- req->open.dfd = READ_ONCE(sqe->fd);
- req->open.how.mode = READ_ONCE(sqe->len);
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- req->open.how.flags = READ_ONCE(sqe->open_flags);
- if (force_o_largefile())
+ /* open.how should be already initialised */
+ if (!(req->open.how.flags & O_PATH) && force_o_largefile())
req->open.how.flags |= O_LARGEFILE;
+ req->open.dfd = READ_ONCE(sqe->fd);
+ fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->open.filename = getname(fname);
if (IS_ERR(req->open.filename)) {
ret = PTR_ERR(req->open.filename);
req->open.filename = NULL;
return ret;
}
-
req->open.nofile = rlimit(RLIMIT_NOFILE);
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
+static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ u64 flags, mode;
+
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
+ mode = READ_ONCE(sqe->len);
+ flags = READ_ONCE(sqe->open_flags);
+ req->open.how = build_open_how(flags, mode);
+ return __io_openat_prep(req, sqe);
+}
+
static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct open_how __user *how;
- const char __user *fname;
size_t len;
int ret;
- if (sqe->ioprio || sqe->buf_index)
- return -EINVAL;
- if (req->flags & REQ_F_FIXED_FILE)
- return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
-
- req->open.dfd = READ_ONCE(sqe->fd);
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
-
if (len < OPEN_HOW_SIZE_VER0)
return -EINVAL;
@@ -2984,19 +3051,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (ret)
return ret;
- if (!(req->open.how.flags & O_PATH) && force_o_largefile())
- req->open.how.flags |= O_LARGEFILE;
-
- req->open.filename = getname(fname);
- if (IS_ERR(req->open.filename)) {
- ret = PTR_ERR(req->open.filename);
- req->open.filename = NULL;
- return ret;
- }
-
- req->open.nofile = rlimit(RLIMIT_NOFILE);
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
+ return __io_openat_prep(req, sqe);
}
static int io_openat2(struct io_kiocb *req, bool force_nonblock)
@@ -3036,7 +3091,6 @@ err:
static int io_openat(struct io_kiocb *req, bool force_nonblock)
{
- req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
return io_openat2(req, force_nonblock);
}
@@ -3125,7 +3179,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
p->addr = READ_ONCE(sqe->addr);
p->len = READ_ONCE(sqe->len);
- if (!access_ok(u64_to_user_ptr(p->addr), p->len))
+ if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
return -EFAULT;
p->bgid = READ_ONCE(sqe->buf_group);
@@ -3203,6 +3257,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
#if defined(CONFIG_EPOLL)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
req->epoll.epfd = READ_ONCE(sqe->fd);
req->epoll.op = READ_ONCE(sqe->len);
@@ -3247,6 +3303,8 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
if (sqe->ioprio || sqe->buf_index || sqe->off)
return -EINVAL;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
req->madvise.addr = READ_ONCE(sqe->addr);
req->madvise.len = READ_ONCE(sqe->len);
@@ -3281,6 +3339,8 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (sqe->ioprio || sqe->buf_index || sqe->addr)
return -EINVAL;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
req->fadvise.offset = READ_ONCE(sqe->off);
req->fadvise.len = READ_ONCE(sqe->len);
@@ -3314,71 +3374,37 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- const char __user *fname;
- unsigned lookup_flags;
- int ret;
-
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
-
- req->open.dfd = READ_ONCE(sqe->fd);
- req->open.mask = READ_ONCE(sqe->len);
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- req->open.how.flags = READ_ONCE(sqe->statx_flags);
- if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags))
- return -EINVAL;
+ req->statx.dfd = READ_ONCE(sqe->fd);
+ req->statx.mask = READ_ONCE(sqe->len);
+ req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ req->statx.flags = READ_ONCE(sqe->statx_flags);
- req->open.filename = getname_flags(fname, lookup_flags, NULL);
- if (IS_ERR(req->open.filename)) {
- ret = PTR_ERR(req->open.filename);
- req->open.filename = NULL;
- return ret;
- }
-
- req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
static int io_statx(struct io_kiocb *req, bool force_nonblock)
{
- struct io_open *ctx = &req->open;
- unsigned lookup_flags;
- struct path path;
- struct kstat stat;
+ struct io_statx *ctx = &req->statx;
int ret;
- if (force_nonblock)
+ if (force_nonblock) {
+ /* only need file table for an actual valid fd */
+ if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
+ req->flags |= REQ_F_NO_FILE_TABLE;
return -EAGAIN;
+ }
- if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
- return -EINVAL;
-
-retry:
- /* filename_lookup() drops it, keep a reference */
- ctx->filename->refcnt++;
-
- ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
- NULL);
- if (ret)
- goto err;
+ ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
+ ctx->buffer);
- ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags);
- path_put(&path);
- if (retry_estale(ret, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
- if (!ret)
- ret = cp_statx(&stat, ctx->buffer);
-err:
- putname(ctx->filename);
- req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
@@ -3390,10 +3416,14 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
/*
* If we queue this for async, it must not be cancellable. That would
- * leave the 'file' in an undeterminate state.
+ * leave the 'file' in an undeterminate state, and here need to modify
+ * io_wq_work.flags, so initialize io_wq_work firstly.
*/
+ io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_NO_CANCEL;
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
sqe->rw_flags || sqe->buf_index)
return -EINVAL;
@@ -3401,64 +3431,41 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
- if (req->file->f_op == &io_uring_fops ||
+ if ((req->file && req->file->f_op == &io_uring_fops) ||
req->close.fd == req->ctx->ring_fd)
return -EBADF;
+ req->close.put_file = NULL;
return 0;
}
-/* only called when __close_fd_get_file() is done */
-static void __io_close_finish(struct io_kiocb *req)
-{
- int ret;
-
- ret = filp_close(req->close.put_file, req->work.files);
- if (ret < 0)
- req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- fput(req->close.put_file);
- io_put_req(req);
-}
-
-static void io_close_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- /* not cancellable, don't do io_req_cancelled() */
- __io_close_finish(req);
- io_steal_work(req, workptr);
-}
-
static int io_close(struct io_kiocb *req, bool force_nonblock)
{
+ struct io_close *close = &req->close;
int ret;
- req->close.put_file = NULL;
- ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
- if (ret < 0)
- return ret;
+ /* might be already done during nonblock submission */
+ if (!close->put_file) {
+ ret = __close_fd_get_file(close->fd, &close->put_file);
+ if (ret < 0)
+ return (ret == -ENOENT) ? -EBADF : ret;
+ }
/* if the file has a flush method, be safe and punt to async */
- if (req->close.put_file->f_op->flush && force_nonblock) {
- /* submission ref will be dropped, take it for async */
- refcount_inc(&req->refs);
-
- req->work.func = io_close_finish;
- /*
- * Do manual async queue here to avoid grabbing files - we don't
- * need the files, and it'll cause io_close_finish() to close
- * the file again and cause a double CQE entry for this request
- */
- io_queue_async_work(req);
- return 0;
+ if (close->put_file->f_op->flush && force_nonblock) {
+ /* avoid grabbing files - we don't need the files */
+ req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
+ return -EAGAIN;
}
- /*
- * No ->flush(), safely close from here and just punt the
- * fput() to async context.
- */
- __io_close_finish(req);
+ /* No ->flush() or already async, safely close from here */
+ ret = filp_close(close->put_file, req->work.files);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ fput(close->put_file);
+ close->put_file = NULL;
+ io_put_req(req);
return 0;
}
@@ -3480,38 +3487,20 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static void __io_sync_file_range(struct io_kiocb *req)
+static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
{
int ret;
+ /* sync_file_range always requires a blocking context */
+ if (force_nonblock)
+ return -EAGAIN;
+
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
-}
-
-
-static void io_sync_file_range_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_sync_file_range(req);
- io_put_req(req); /* put submission ref */
-}
-
-static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
-{
- /* sync_file_range always requires a blocking context */
- if (force_nonblock) {
- req->work.func = io_sync_file_range_finish;
- return -EAGAIN;
- }
-
- __io_sync_file_range(req);
return 0;
}
@@ -3537,6 +3526,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_async_ctx *io = req->io;
int ret;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
@@ -3566,9 +3558,6 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
struct socket *sock;
int ret;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_async_ctx io;
@@ -3622,9 +3611,6 @@ static int io_send(struct io_kiocb *req, bool force_nonblock)
struct socket *sock;
int ret;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_sr_msg *sr = &req->sr_msg;
@@ -3777,6 +3763,9 @@ static int io_recvmsg_prep(struct io_kiocb *req,
struct io_async_ctx *io = req->io;
int ret;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
@@ -3805,9 +3794,6 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
struct socket *sock;
int ret, cflags = 0;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_buffer *kbuf;
@@ -3869,9 +3855,6 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock)
struct socket *sock;
int ret, cflags = 0;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_sr_msg *sr = &req->sr_msg;
@@ -3939,49 +3922,30 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int __io_accept(struct io_kiocb *req, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock)
{
struct io_accept *accept = &req->accept;
- unsigned file_flags;
+ unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
int ret;
- file_flags = force_nonblock ? O_NONBLOCK : 0;
+ if (req->file->f_flags & O_NONBLOCK)
+ req->flags |= REQ_F_NOWAIT;
+
ret = __sys_accept4_file(req->file, file_flags, accept->addr,
accept->addr_len, accept->flags,
accept->nofile);
if (ret == -EAGAIN && force_nonblock)
return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- if (ret < 0)
+ if (ret < 0) {
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
req_set_fail_links(req);
+ }
io_cqring_add_event(req, ret);
io_put_req(req);
return 0;
}
-static void io_accept_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_accept(req, false);
- io_steal_work(req, workptr);
-}
-
-static int io_accept(struct io_kiocb *req, bool force_nonblock)
-{
- int ret;
-
- ret = __io_accept(req, force_nonblock);
- if (ret == -EAGAIN && force_nonblock) {
- req->work.func = io_accept_finish;
- return -EAGAIN;
- }
- return 0;
-}
-
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = &req->connect;
@@ -4101,27 +4065,6 @@ struct io_poll_table {
int error;
};
-static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
- struct wait_queue_head *head)
-{
- if (unlikely(poll->head)) {
- pt->error = -EINVAL;
- return;
- }
-
- pt->error = 0;
- poll->head = head;
- add_wait_queue(head, &poll->wait);
-}
-
-static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
- struct poll_table_struct *p)
-{
- struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
-
- __io_queue_proc(&pt->req->apoll->poll, pt, head);
-}
-
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
@@ -4140,12 +4083,14 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
req->result = mask;
init_task_work(&req->task_work, func);
/*
- * If this fails, then the task is exiting. Punt to one of the io-wq
- * threads to ensure the work gets run, we can't always rely on exit
- * cancelation taking care of this.
+ * If this fails, then the task is exiting. When a task exits, the
+ * work gets canceled, so just cancel this request as well instead
+ * of executing it. We can't safely execute it anyway, as we may not
+ * have the needed state needed for it anyway.
*/
ret = task_work_add(tsk, &req->task_work, true);
if (unlikely(ret)) {
+ WRITE_ONCE(poll->canceled, true);
tsk = io_wq_get_task(req->ctx->io_wq);
task_work_add(tsk, &req->task_work, true);
}
@@ -4153,31 +4098,233 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
return 1;
}
+static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
+ __acquires(&req->ctx->completion_lock)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ struct poll_table_struct pt = { ._key = poll->events };
+
+ req->result = vfs_poll(req->file, &pt) & poll->events;
+ }
+
+ spin_lock_irq(&ctx->completion_lock);
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ add_wait_queue(poll->head, &poll->wait);
+ return true;
+ }
+
+ return false;
+}
+
+static void io_poll_remove_double(struct io_kiocb *req)
+{
+ struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
+
+ lockdep_assert_held(&req->ctx->completion_lock);
+
+ if (poll && poll->head) {
+ struct wait_queue_head *head = poll->head;
+
+ spin_lock(&head->lock);
+ list_del_init(&poll->wait.entry);
+ if (poll->wait.private)
+ refcount_dec(&req->refs);
+ poll->head = NULL;
+ spin_unlock(&head->lock);
+ }
+}
+
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ io_poll_remove_double(req);
+ req->poll.done = true;
+ io_cqring_fill_event(req, error ? error : mangle_poll(mask));
+ io_commit_cqring(ctx);
+}
+
+static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (io_poll_rewait(req, &req->poll)) {
+ spin_unlock_irq(&ctx->completion_lock);
+ return;
+ }
+
+ hash_del(&req->hash_node);
+ io_poll_complete(req, req->result, 0);
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_put_req_find_next(req, nxt);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_ev_posted(ctx);
+}
+
+static void io_poll_task_func(struct callback_head *cb)
+{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct io_kiocb *nxt = NULL;
+
+ io_poll_task_handler(req, &nxt);
+ if (nxt) {
+ struct io_ring_ctx *ctx = nxt->ctx;
+
+ mutex_lock(&ctx->uring_lock);
+ __io_queue_sqe(nxt, NULL);
+ mutex_unlock(&ctx->uring_lock);
+ }
+}
+
+static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct io_kiocb *req = wait->private;
+ struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
+ __poll_t mask = key_to_poll(key);
+
+ /* for instances that support it check for an event match first: */
+ if (mask && !(mask & poll->events))
+ return 0;
+
+ if (req->poll.head) {
+ bool done;
+
+ spin_lock(&req->poll.head->lock);
+ done = list_empty(&req->poll.wait.entry);
+ if (!done)
+ list_del_init(&req->poll.wait.entry);
+ spin_unlock(&req->poll.head->lock);
+ if (!done)
+ __io_async_wake(req, poll, mask, io_poll_task_func);
+ }
+ refcount_dec(&req->refs);
+ return 1;
+}
+
+static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
+ wait_queue_func_t wake_func)
+{
+ poll->head = NULL;
+ poll->done = false;
+ poll->canceled = false;
+ poll->events = events;
+ INIT_LIST_HEAD(&poll->wait.entry);
+ init_waitqueue_func_entry(&poll->wait, wake_func);
+}
+
+static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
+ struct wait_queue_head *head)
+{
+ struct io_kiocb *req = pt->req;
+
+ /*
+ * If poll->head is already set, it's because the file being polled
+ * uses multiple waitqueues for poll handling (eg one for read, one
+ * for write). Setup a separate io_poll_iocb if this happens.
+ */
+ if (unlikely(poll->head)) {
+ /* already have a 2nd entry, fail a third attempt */
+ if (req->io) {
+ pt->error = -EINVAL;
+ return;
+ }
+ poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
+ if (!poll) {
+ pt->error = -ENOMEM;
+ return;
+ }
+ io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
+ refcount_inc(&req->refs);
+ poll->wait.private = req;
+ req->io = (void *) poll;
+ }
+
+ pt->error = 0;
+ poll->head = head;
+ add_wait_queue(head, &poll->wait);
+}
+
+static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
+ struct poll_table_struct *p)
+{
+ struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+
+ __io_queue_proc(&pt->req->apoll->poll, pt, head);
+}
+
+static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (mm) {
+ kthread_unuse_mm(mm);
+ mmput(mm);
+ }
+}
+
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+ if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+ return -EFAULT;
+ kthread_use_mm(ctx->sqo_mm);
+ }
+
+ return 0;
+}
+
static void io_async_task_func(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
+ bool canceled = false;
trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
- WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
+ if (io_poll_rewait(req, &apoll->poll)) {
+ spin_unlock_irq(&ctx->completion_lock);
+ return;
+ }
+ /* If req is still hashed, it cannot have been canceled. Don't check. */
if (hash_hashed(&req->hash_node)) {
- spin_lock_irq(&ctx->completion_lock);
hash_del(&req->hash_node);
- spin_unlock_irq(&ctx->completion_lock);
+ } else {
+ canceled = READ_ONCE(apoll->poll.canceled);
+ if (canceled) {
+ io_cqring_fill_event(req, -ECANCELED);
+ io_commit_cqring(ctx);
+ }
}
- /* restore ->work in case we need to retry again */
- memcpy(&req->work, &apoll->work, sizeof(req->work));
-
- __set_current_state(TASK_RUNNING);
- mutex_lock(&ctx->uring_lock);
- __io_queue_sqe(req, NULL);
- mutex_unlock(&ctx->uring_lock);
+ spin_unlock_irq(&ctx->completion_lock);
+ /* restore ->work in case we need to retry again */
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
kfree(apoll);
+
+ if (!canceled) {
+ __set_current_state(TASK_RUNNING);
+ if (io_sq_thread_acquire_mm(ctx, req)) {
+ io_cqring_add_event(req, -EFAULT);
+ goto end_req;
+ }
+ mutex_lock(&ctx->uring_lock);
+ __io_queue_sqe(req, NULL);
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ io_cqring_ev_posted(ctx);
+end_req:
+ req_set_fail_links(req);
+ io_double_put_req(req);
+ }
}
static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -4211,18 +4358,13 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
bool cancel = false;
poll->file = req->file;
- poll->head = NULL;
- poll->done = poll->canceled = false;
- poll->events = mask;
+ io_init_poll_iocb(poll, mask, wake_func);
+ poll->wait.private = req;
ipt->pt._key = mask;
ipt->req = req;
ipt->error = -EINVAL;
- INIT_LIST_HEAD(&poll->wait.entry);
- init_waitqueue_func_entry(&poll->wait, wake_func);
- poll->wait.private = req;
-
mask = vfs_poll(req->file, &ipt->pt) & poll->events;
spin_lock_irq(&ctx->completion_lock);
@@ -4253,6 +4395,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t mask, ret;
+ bool had_io;
if (!req->file || !file_can_poll(req->file))
return false;
@@ -4266,10 +4409,11 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
return false;
req->flags |= REQ_F_POLLED;
- memcpy(&apoll->work, &req->work, sizeof(req->work));
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&apoll->work, &req->work, sizeof(req->work));
+ had_io = req->io != NULL;
- get_task_struct(current);
- req->task = current;
+ io_get_req_task(req);
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
@@ -4286,9 +4430,12 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
io_async_wake);
if (ret) {
ipt.error = 0;
- apoll->poll.done = true;
+ /* only remove double add if we did it here */
+ if (!had_io)
+ io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
- memcpy(&req->work, &apoll->work, sizeof(req->work));
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
kfree(apoll);
return false;
}
@@ -4310,6 +4457,7 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
do_complete = true;
}
spin_unlock(&poll->head->lock);
+ hash_del(&req->hash_node);
return do_complete;
}
@@ -4318,16 +4466,27 @@ static bool io_poll_remove_one(struct io_kiocb *req)
bool do_complete;
if (req->opcode == IORING_OP_POLL_ADD) {
+ io_poll_remove_double(req);
do_complete = __io_poll_remove_one(req, &req->poll);
} else {
+ struct async_poll *apoll = req->apoll;
+
/* non-poll requests have submit ref still */
- do_complete = __io_poll_remove_one(req, &req->apoll->poll);
- if (do_complete)
+ do_complete = __io_poll_remove_one(req, &apoll->poll);
+ if (do_complete) {
io_put_req(req);
+ /*
+ * restore ->work because we will call
+ * io_req_work_drop_env below when dropping the
+ * final reference.
+ */
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&req->work, &apoll->work,
+ sizeof(req->work));
+ kfree(apoll);
+ }
}
- hash_del(&req->hash_node);
-
if (do_complete) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(req->ctx);
@@ -4342,7 +4501,7 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
{
struct hlist_node *tmp;
struct io_kiocb *req;
- int i;
+ int posted = 0, i;
spin_lock_irq(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
@@ -4350,11 +4509,12 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
list = &ctx->cancel_hash[i];
hlist_for_each_entry_safe(req, tmp, list, hash_node)
- io_poll_remove_one(req);
+ posted += io_poll_remove_one(req);
}
spin_unlock_irq(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
+ if (posted)
+ io_cqring_ev_posted(ctx);
}
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
@@ -4409,56 +4569,6 @@ static int io_poll_remove(struct io_kiocb *req)
return 0;
}
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- req->poll.done = true;
- io_cqring_fill_event(req, error ? error : mangle_poll(mask));
- io_commit_cqring(ctx);
-}
-
-static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_poll_iocb *poll = &req->poll;
-
- if (!req->result && !READ_ONCE(poll->canceled)) {
- struct poll_table_struct pt = { ._key = poll->events };
-
- req->result = vfs_poll(req->file, &pt) & poll->events;
- }
-
- spin_lock_irq(&ctx->completion_lock);
- if (!req->result && !READ_ONCE(poll->canceled)) {
- add_wait_queue(poll->head, &poll->wait);
- spin_unlock_irq(&ctx->completion_lock);
- return;
- }
- hash_del(&req->hash_node);
- io_poll_complete(req, req->result, 0);
- req->flags |= REQ_F_COMP_LOCKED;
- io_put_req_find_next(req, nxt);
- spin_unlock_irq(&ctx->completion_lock);
-
- io_cqring_ev_posted(ctx);
-}
-
-static void io_poll_task_func(struct callback_head *cb)
-{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
- struct io_kiocb *nxt = NULL;
-
- io_poll_task_handler(req, &nxt);
- if (nxt) {
- struct io_ring_ctx *ctx = nxt->ctx;
-
- mutex_lock(&ctx->uring_lock);
- __io_queue_sqe(nxt, NULL);
- mutex_unlock(&ctx->uring_lock);
- }
-}
-
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
@@ -4491,8 +4601,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
- get_task_struct(current);
- req->task = current;
+ io_get_req_task(req);
return 0;
}
@@ -4538,20 +4647,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
* We could be racing with timeout deletion. If the list is empty,
* then timeout lookup already found it and will be handling it.
*/
- if (!list_empty(&req->list)) {
- struct io_kiocb *prev;
-
- /*
- * Adjust the reqs sequence before the current one because it
- * will consume a slot in the cq_ring and the cq_tail
- * pointer will be increased, otherwise other timeout reqs may
- * return in advance without waiting for enough wait_nr.
- */
- prev = req;
- list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
- prev->sequence++;
+ if (!list_empty(&req->list))
list_del_init(&req->list);
- }
io_cqring_fill_event(req, -ETIME);
io_commit_cqring(ctx);
@@ -4631,18 +4728,19 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
{
struct io_timeout_data *data;
unsigned flags;
+ u32 off = READ_ONCE(sqe->off);
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
return -EINVAL;
- if (sqe->off && is_timeout_link)
+ if (off && is_timeout_link)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
if (flags & ~IORING_TIMEOUT_ABS)
return -EINVAL;
- req->timeout.count = READ_ONCE(sqe->off);
+ req->timeout.off = off;
if (!req->io && io_alloc_async_ctx(req))
return -ENOMEM;
@@ -4665,70 +4763,40 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
static int io_timeout(struct io_kiocb *req)
{
- unsigned count;
struct io_ring_ctx *ctx = req->ctx;
- struct io_timeout_data *data;
+ struct io_timeout_data *data = &req->io->timeout;
struct list_head *entry;
- unsigned span = 0;
+ u32 tail, off = req->timeout.off;
- data = &req->io->timeout;
+ spin_lock_irq(&ctx->completion_lock);
/*
* sqe->off holds how many events that need to occur for this
* timeout event to be satisfied. If it isn't set, then this is
* a pure timeout request, sequence isn't used.
*/
- count = req->timeout.count;
- if (!count) {
+ if (!off) {
req->flags |= REQ_F_TIMEOUT_NOSEQ;
- spin_lock_irq(&ctx->completion_lock);
entry = ctx->timeout_list.prev;
goto add;
}
- req->sequence = ctx->cached_sq_head + count - 1;
- data->seq_offset = count;
+ tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
+ req->timeout.target_seq = tail + off;
/*
* Insertion sort, ensuring the first entry in the list is always
* the one we need first.
*/
- spin_lock_irq(&ctx->completion_lock);
list_for_each_prev(entry, &ctx->timeout_list) {
struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
- unsigned nxt_sq_head;
- long long tmp, tmp_nxt;
- u32 nxt_offset = nxt->io->timeout.seq_offset;
if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
continue;
-
- /*
- * Since cached_sq_head + count - 1 can overflow, use type long
- * long to store it.
- */
- tmp = (long long)ctx->cached_sq_head + count - 1;
- nxt_sq_head = nxt->sequence - nxt_offset + 1;
- tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
-
- /*
- * cached_sq_head may overflow, and it will never overflow twice
- * once there is some timeout req still be valid.
- */
- if (ctx->cached_sq_head < nxt_sq_head)
- tmp += UINT_MAX;
-
- if (tmp > tmp_nxt)
+ /* nxt.seq is behind @tail, otherwise would've been completed */
+ if (off >= nxt->timeout.target_seq - tail)
break;
-
- /*
- * Sequence of reqs after the insert one and itself should
- * be adjusted because each timeout req consumes a slot.
- */
- span++;
- nxt->sequence++;
}
- req->sequence -= span;
add:
list_add(&req->list, entry);
data->timer.function = io_timeout_fn;
@@ -4749,7 +4817,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
enum io_wq_cancel cancel_ret;
int ret = 0;
- cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
+ cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
switch (cancel_ret) {
case IO_WQ_CANCEL_OK:
ret = 0;
@@ -4862,6 +4930,8 @@ static int io_req_defer_prep(struct io_kiocb *req,
if (!sqe)
return 0;
+ io_req_init_async(req);
+
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (unlikely(ret))
@@ -4957,6 +5027,9 @@ static int io_req_defer_prep(struct io_kiocb *req,
case IORING_OP_REMOVE_BUFFERS:
ret = io_remove_buffers_prep(req, sqe);
break;
+ case IORING_OP_TEE:
+ ret = io_tee_prep(req, sqe);
+ break;
default:
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
req->opcode);
@@ -4973,15 +5046,16 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
int ret;
/* Still need defer if there is pending req in defer list. */
- if (!req_need_defer(req) && list_empty(&ctx->defer_list))
+ if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list))
return 0;
- if (!req->io && io_alloc_async_ctx(req))
- return -EAGAIN;
-
- ret = io_req_defer_prep(req, sqe);
- if (ret < 0)
- return ret;
+ if (!req->io) {
+ if (io_alloc_async_ctx(req))
+ return -EAGAIN;
+ ret = io_req_defer_prep(req, sqe);
+ if (ret < 0)
+ return ret;
+ }
spin_lock_irq(&ctx->completion_lock);
if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
@@ -5026,10 +5100,9 @@ static void io_cleanup_req(struct io_kiocb *req)
break;
case IORING_OP_OPENAT:
case IORING_OP_OPENAT2:
- case IORING_OP_STATX:
- putname(req->open.filename);
break;
case IORING_OP_SPLICE:
+ case IORING_OP_TEE:
io_put_file(req, req->splice.file_in,
(req->splice.flags & SPLICE_F_FD_IN_FIXED));
break;
@@ -5260,6 +5333,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
}
ret = io_remove_buffers(req, force_nonblock);
break;
+ case IORING_OP_TEE:
+ if (sqe) {
+ ret = io_tee_prep(req, sqe);
+ if (ret < 0)
+ break;
+ }
+ ret = io_tee(req, force_nonblock);
+ break;
default:
ret = -EINVAL;
break;
@@ -5268,7 +5349,8 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
return ret;
- if (ctx->flags & IORING_SETUP_IOPOLL) {
+ /* If the op doesn't have a file, we're not polling for it */
+ if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
const bool in_async = io_wq_current_is_worker();
if (req->result == -EAGAIN)
@@ -5287,12 +5369,26 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}
+static void io_arm_async_linked_timeout(struct io_kiocb *req)
+{
+ struct io_kiocb *link;
+
+ /* link head's timeout is queued in io_queue_async_work() */
+ if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
+ return;
+
+ link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+ io_queue_linked_timeout(link);
+}
+
static void io_wq_submit_work(struct io_wq_work **workptr)
{
struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
int ret = 0;
+ io_arm_async_linked_timeout(req);
+
/* if NO_CANCEL is set, we must still run the work */
if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
IO_WQ_WORK_CANCEL) {
@@ -5322,22 +5418,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
io_steal_work(req, workptr);
}
-static int io_req_needs_file(struct io_kiocb *req, int fd)
-{
- if (!io_op_defs[req->opcode].needs_file)
- return 0;
- if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg)
- return 0;
- return 1;
-}
-
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
int index)
{
struct fixed_file_table *table;
table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
- return table->files[index & IORING_FILE_TABLE_MASK];;
+ return table->files[index & IORING_FILE_TABLE_MASK];
}
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
@@ -5352,31 +5439,29 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files);
file = io_file_from_index(ctx, fd);
- if (!file)
- return -EBADF;
- req->fixed_file_refs = ctx->file_data->cur_refs;
- percpu_ref_get(req->fixed_file_refs);
+ if (file) {
+ req->fixed_file_refs = ctx->file_data->cur_refs;
+ percpu_ref_get(req->fixed_file_refs);
+ }
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd);
- if (unlikely(!file))
- return -EBADF;
}
- *out_file = file;
- return 0;
+ if (file || io_op_defs[req->opcode].needs_file_no_error) {
+ *out_file = file;
+ return 0;
+ }
+ return -EBADF;
}
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
- int fd, unsigned int flags)
+ int fd)
{
bool fixed;
- if (!io_req_needs_file(req, fd))
- return 0;
-
- fixed = (flags & IOSQE_FIXED_FILE);
- if (unlikely(!fixed && req->needs_fixed_file))
+ fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
+ if (unlikely(!fixed && io_async_submit(req->ctx)))
return -EBADF;
return io_file_get(state, req, fd, &req->file, fixed);
@@ -5387,7 +5472,7 @@ static int io_grab_files(struct io_kiocb *req)
int ret = -EBADF;
struct io_ring_ctx *ctx = req->ctx;
- if (req->work.files)
+ if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
return 0;
if (!ctx->ring_file)
return -EBADF;
@@ -5476,7 +5561,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
struct io_kiocb *nxt;
- if (!(req->flags & REQ_F_LINK))
+ if (!(req->flags & REQ_F_LINK_HEAD))
return NULL;
/* for polled retry, if flag is set, we already went through here */
if (req->flags & REQ_F_POLLED)
@@ -5501,7 +5586,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
again:
linked_timeout = io_prep_linked_timeout(req);
- if (req->work.creds && req->work.creds != current_cred()) {
+ if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
+ req->work.creds != current_cred()) {
if (old_creds)
revert_creds(old_creds);
if (old_creds == req->work.creds)
@@ -5524,6 +5610,8 @@ again:
goto exit;
}
punt:
+ io_req_init_async(req);
+
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (ret)
@@ -5581,9 +5669,15 @@ fail_req:
io_double_put_req(req);
}
} else if (req->flags & REQ_F_FORCE_ASYNC) {
- ret = io_req_defer_prep(req, sqe);
- if (unlikely(ret < 0))
- goto fail_req;
+ if (!req->io) {
+ ret = -EAGAIN;
+ if (io_alloc_async_ctx(req))
+ goto fail_req;
+ ret = io_req_defer_prep(req, sqe);
+ if (unlikely(ret < 0))
+ goto fail_req;
+ }
+
/*
* Never try inline submit of IOSQE_ASYNC is set, go straight
* to async execution.
@@ -5604,54 +5698,11 @@ static inline void io_queue_link_head(struct io_kiocb *req)
io_queue_sqe(req, NULL);
}
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
-
-static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_submit_state *state, struct io_kiocb **link)
+static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_kiocb **link)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned int sqe_flags;
- int ret, id, fd;
-
- sqe_flags = READ_ONCE(sqe->flags);
-
- /* enforce forwards compatibility on users */
- if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
- ret = -EINVAL;
- goto err_req;
- }
-
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[req->opcode].buffer_select) {
- ret = -EOPNOTSUPP;
- goto err_req;
- }
-
- id = READ_ONCE(sqe->personality);
- if (id) {
- req->work.creds = idr_find(&ctx->personality_idr, id);
- if (unlikely(!req->work.creds)) {
- ret = -EINVAL;
- goto err_req;
- }
- get_cred(req->work.creds);
- }
-
- /* same numerical values with corresponding REQ_F_*, safe to copy */
- req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
- IOSQE_ASYNC | IOSQE_FIXED_FILE |
- IOSQE_BUFFER_SELECT);
-
- fd = READ_ONCE(sqe->fd);
- ret = io_req_set_file(state, req, fd, sqe_flags);
- if (unlikely(ret)) {
-err_req:
- io_cqring_add_event(req, ret);
- io_double_put_req(req);
- return false;
- }
+ int ret;
/*
* If we already have a head request, queue this one for async
@@ -5670,42 +5721,39 @@ err_req:
* next after the link request. The last one is done via
* drain_next flag to persist the effect across calls.
*/
- if (sqe_flags & IOSQE_IO_DRAIN) {
+ if (req->flags & REQ_F_IO_DRAIN) {
head->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 1;
}
- if (io_alloc_async_ctx(req)) {
- ret = -EAGAIN;
- goto err_req;
- }
+ if (io_alloc_async_ctx(req))
+ return -EAGAIN;
ret = io_req_defer_prep(req, sqe);
if (ret) {
/* fail even hard links since we don't submit */
head->flags |= REQ_F_FAIL_LINK;
- goto err_req;
+ return ret;
}
trace_io_uring_link(ctx, req, head);
list_add_tail(&req->link_list, &head->link_list);
/* last request of a link, enqueue the link */
- if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
+ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
io_queue_link_head(head);
*link = NULL;
}
} else {
if (unlikely(ctx->drain_next)) {
req->flags |= REQ_F_IO_DRAIN;
- req->ctx->drain_next = 0;
+ ctx->drain_next = 0;
}
- if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
- req->flags |= REQ_F_LINK;
+ if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ req->flags |= REQ_F_LINK_HEAD;
INIT_LIST_HEAD(&req->link_list);
- if (io_alloc_async_ctx(req)) {
- ret = -EAGAIN;
- goto err_req;
- }
+ if (io_alloc_async_ctx(req))
+ return -EAGAIN;
+
ret = io_req_defer_prep(req, sqe);
if (ret)
req->flags |= REQ_F_FAIL_LINK;
@@ -5715,7 +5763,7 @@ err_req:
}
}
- return true;
+ return 0;
}
/*
@@ -5724,7 +5772,7 @@ err_req:
static void io_submit_state_end(struct io_submit_state *state)
{
blk_finish_plug(&state->plug);
- io_file_put(state);
+ io_state_file_put(state);
if (state->free_reqs)
kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
}
@@ -5789,15 +5837,23 @@ static inline void io_consume_sqe(struct io_ring_ctx *ctx)
ctx->cached_sq_head++;
}
-static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+ IOSQE_BUFFER_SELECT)
+
+static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe,
+ struct io_submit_state *state)
{
+ unsigned int sqe_flags;
+ int id;
+
/*
* All io need record the previous position, if LINK vs DARIN,
* it can be used to mark the position of the first IO in the
* link list.
*/
- req->sequence = ctx->cached_sq_head;
+ req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
req->opcode = READ_ONCE(sqe->opcode);
req->user_data = READ_ONCE(sqe->user_data);
req->io = NULL;
@@ -5806,19 +5862,48 @@ static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->flags = 0;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
- req->task = NULL;
+ req->task = current;
req->result = 0;
- INIT_IO_WORK(&req->work, io_wq_submit_work);
+
+ if (unlikely(req->opcode >= IORING_OP_LAST))
+ return -EINVAL;
+
+ if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
+ return -EFAULT;
+
+ sqe_flags = READ_ONCE(sqe->flags);
+ /* enforce forwards compatibility on users */
+ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
+ return -EINVAL;
+
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[req->opcode].buffer_select)
+ return -EOPNOTSUPP;
+
+ id = READ_ONCE(sqe->personality);
+ if (id) {
+ io_req_init_async(req);
+ req->work.creds = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!req->work.creds))
+ return -EINVAL;
+ get_cred(req->work.creds);
+ }
+
+ /* same numerical values with corresponding REQ_F_*, safe to copy */
+ req->flags |= sqe_flags;
+
+ if (!io_op_defs[req->opcode].needs_file)
+ return 0;
+
+ return io_req_set_file(state, req, READ_ONCE(sqe->fd));
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
- struct file *ring_file, int ring_fd,
- struct mm_struct **mm, bool async)
+ struct file *ring_file, int ring_fd)
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
int i, submitted = 0;
- bool mm_fault = false;
/* if we have a backlog and couldn't flush it all, return BUSY */
if (test_bit(0, &ctx->sq_check_overflow)) {
@@ -5858,34 +5943,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
break;
}
- io_init_req(ctx, req, sqe);
+ err = io_init_req(ctx, req, sqe, statep);
io_consume_sqe(ctx);
/* will complete beyond this point, count as submitted */
submitted++;
- if (unlikely(req->opcode >= IORING_OP_LAST)) {
- err = -EINVAL;
+ if (unlikely(err)) {
fail_req:
io_cqring_add_event(req, err);
io_double_put_req(req);
break;
}
- if (io_op_defs[req->opcode].needs_mm && !*mm) {
- mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
- if (unlikely(mm_fault)) {
- err = -EFAULT;
- goto fail_req;
- }
- use_mm(ctx->sqo_mm);
- *mm = ctx->sqo_mm;
- }
-
- req->needs_fixed_file = async;
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
- true, async);
- if (!io_submit_sqe(req, sqe, statep, &link))
- break;
+ true, io_async_submit(ctx));
+ err = io_submit_sqe(req, sqe, &link);
+ if (err)
+ goto fail_req;
}
if (unlikely(submitted != nr)) {
@@ -5907,17 +5981,13 @@ fail_req:
static int io_sq_thread(void *data)
{
struct io_ring_ctx *ctx = data;
- struct mm_struct *cur_mm = NULL;
const struct cred *old_cred;
- mm_segment_t old_fs;
DEFINE_WAIT(wait);
unsigned long timeout;
int ret = 0;
- complete(&ctx->completions[1]);
+ complete(&ctx->sq_thread_comp);
- old_fs = get_fs();
- set_fs(USER_DS);
old_cred = override_creds(ctx->creds);
timeout = jiffies + ctx->sq_thread_idle;
@@ -5948,11 +6018,7 @@ static int io_sq_thread(void *data)
* adding ourselves to the waitqueue, as the unuse/drop
* may sleep.
*/
- if (cur_mm) {
- unuse_mm(cur_mm);
- mmput(cur_mm);
- cur_mm = NULL;
- }
+ io_sq_thread_drop_mm(ctx);
/*
* We're polling. If we're within the defined idle
@@ -6008,6 +6074,7 @@ static int io_sq_thread(void *data)
finish_wait(&ctx->sqo_wait, &wait);
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ ret = 0;
continue;
}
finish_wait(&ctx->sqo_wait, &wait);
@@ -6016,7 +6083,8 @@ static int io_sq_thread(void *data)
}
mutex_lock(&ctx->uring_lock);
- ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+ if (likely(!percpu_ref_is_dying(&ctx->refs)))
+ ret = io_submit_sqes(ctx, to_submit, NULL, -1);
mutex_unlock(&ctx->uring_lock);
timeout = jiffies + ctx->sq_thread_idle;
}
@@ -6024,11 +6092,7 @@ static int io_sq_thread(void *data)
if (current->task_works)
task_work_run();
- set_fs(old_fs);
- if (cur_mm) {
- unuse_mm(cur_mm);
- mmput(cur_mm);
- }
+ io_sq_thread_drop_mm(ctx);
revert_creds(old_cred);
kthread_parkme();
@@ -6167,22 +6231,22 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
struct fixed_file_data *data = ctx->file_data;
struct fixed_file_ref_node *ref_node = NULL;
unsigned nr_tables, i;
- unsigned long flags;
if (!data)
return -ENXIO;
- spin_lock_irqsave(&data->lock, flags);
+ spin_lock(&data->lock);
if (!list_empty(&data->ref_list))
ref_node = list_first_entry(&data->ref_list,
struct fixed_file_ref_node, node);
- spin_unlock_irqrestore(&data->lock, flags);
+ spin_unlock(&data->lock);
if (ref_node)
percpu_ref_kill(&ref_node->refs);
percpu_ref_kill(&data->refs);
/* wait for all refs nodes to complete */
+ flush_delayed_work(&ctx->file_put_work);
wait_for_completion(&data->done);
__io_sqe_files_unregister(ctx);
@@ -6200,7 +6264,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
static void io_sq_thread_stop(struct io_ring_ctx *ctx)
{
if (ctx->sqo_thread) {
- wait_for_completion(&ctx->completions[1]);
+ wait_for_completion(&ctx->sq_thread_comp);
/*
* The park is a bit of a work-around, without it we get
* warning spews on shutdown with SQPOLL set and affinity
@@ -6413,40 +6477,63 @@ struct io_file_put {
struct file *file;
};
-static void io_file_put_work(struct work_struct *work)
+static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
{
- struct fixed_file_ref_node *ref_node;
- struct fixed_file_data *file_data;
- struct io_ring_ctx *ctx;
+ struct fixed_file_data *file_data = ref_node->file_data;
+ struct io_ring_ctx *ctx = file_data->ctx;
struct io_file_put *pfile, *tmp;
- unsigned long flags;
-
- ref_node = container_of(work, struct fixed_file_ref_node, work);
- file_data = ref_node->file_data;
- ctx = file_data->ctx;
list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
- list_del_init(&pfile->list);
+ list_del(&pfile->list);
io_ring_file_put(ctx, pfile->file);
kfree(pfile);
}
- spin_lock_irqsave(&file_data->lock, flags);
- list_del_init(&ref_node->node);
- spin_unlock_irqrestore(&file_data->lock, flags);
+ spin_lock(&file_data->lock);
+ list_del(&ref_node->node);
+ spin_unlock(&file_data->lock);
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
percpu_ref_put(&file_data->refs);
}
+static void io_file_put_work(struct work_struct *work)
+{
+ struct io_ring_ctx *ctx;
+ struct llist_node *node;
+
+ ctx = container_of(work, struct io_ring_ctx, file_put_work.work);
+ node = llist_del_all(&ctx->file_put_llist);
+
+ while (node) {
+ struct fixed_file_ref_node *ref_node;
+ struct llist_node *next = node->next;
+
+ ref_node = llist_entry(node, struct fixed_file_ref_node, llist);
+ __io_file_put_work(ref_node);
+ node = next;
+ }
+}
+
static void io_file_data_ref_zero(struct percpu_ref *ref)
{
struct fixed_file_ref_node *ref_node;
+ struct io_ring_ctx *ctx;
+ bool first_add;
+ int delay = HZ;
ref_node = container_of(ref, struct fixed_file_ref_node, refs);
+ ctx = ref_node->file_data->ctx;
+
+ if (percpu_ref_is_dying(&ctx->file_data->refs))
+ delay = 0;
- queue_work(system_wq, &ref_node->work);
+ first_add = llist_add(&ref_node->llist, &ctx->file_put_llist);
+ if (!delay)
+ mod_delayed_work(system_wq, &ctx->file_put_work, 0);
+ else if (first_add)
+ queue_delayed_work(system_wq, &ctx->file_put_work, delay);
}
static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
@@ -6465,10 +6552,8 @@ static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
}
INIT_LIST_HEAD(&ref_node->node);
INIT_LIST_HEAD(&ref_node->file_list);
- INIT_WORK(&ref_node->work, io_file_put_work);
ref_node->file_data = ctx->file_data;
return ref_node;
-
}
static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
@@ -6486,7 +6571,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
int fd, ret = 0;
unsigned i;
struct fixed_file_ref_node *ref_node;
- unsigned long flags;
if (ctx->file_data)
return -EBUSY;
@@ -6594,9 +6678,9 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
}
ctx->file_data->cur_refs = &ref_node->refs;
- spin_lock_irqsave(&ctx->file_data->lock, flags);
+ spin_lock(&ctx->file_data->lock);
list_add(&ref_node->node, &ctx->file_data->ref_list);
- spin_unlock_irqrestore(&ctx->file_data->lock, flags);
+ spin_unlock(&ctx->file_data->lock);
percpu_ref_get(&ctx->file_data->refs);
return ret;
}
@@ -6672,7 +6756,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
__s32 __user *fds;
int fd, i, err;
__u32 done;
- unsigned long flags;
bool needs_switch = false;
if (check_add_overflow(up->offset, nr_args, &done))
@@ -6737,10 +6820,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (needs_switch) {
percpu_ref_kill(data->cur_refs);
- spin_lock_irqsave(&data->lock, flags);
+ spin_lock(&data->lock);
list_add(&ref_node->node, &data->ref_list);
data->cur_refs = &ref_node->refs;
- spin_unlock_irqrestore(&data->lock, flags);
+ spin_unlock(&data->lock);
percpu_ref_get(&ctx->file_data->refs);
} else
destroy_fixed_file_ref_node(ref_node);
@@ -6784,6 +6867,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
data.user = ctx->user;
data.free_work = io_free_work;
+ data.do_work = io_wq_submit_work;
if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
/* Do QD, or 4 * CPUS, whatever is smallest */
@@ -6824,7 +6908,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
{
int ret;
- init_waitqueue_head(&ctx->sqo_wait);
mmgrab(current->mm);
ctx->sqo_mm = current->mm;
@@ -7066,8 +7149,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
ret = 0;
if (!pages || nr_pages > got_pages) {
- kfree(vmas);
- kfree(pages);
+ kvfree(vmas);
+ kvfree(pages);
pages = kvmalloc_array(nr_pages, sizeof(struct page *),
GFP_KERNEL);
vmas = kvmalloc_array(nr_pages,
@@ -7092,7 +7175,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
}
ret = 0;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
pret = pin_user_pages(ubuf, nr_pages,
FOLL_WRITE | FOLL_LONGTERM,
pages, vmas);
@@ -7110,7 +7193,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
} else {
ret = pret < 0 ? pret : -EFAULT;
}
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (ret) {
/*
* if we did partial map, or found file backed vmas,
@@ -7229,7 +7312,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
ring_pages(ctx->sq_entries, ctx->cq_entries));
free_uid(ctx->user);
put_cred(ctx->creds);
- kfree(ctx->completions);
kfree(ctx->cancel_hash);
kmem_cache_free(req_cachep, ctx->fallback_req);
kfree(ctx);
@@ -7281,7 +7363,17 @@ static void io_ring_exit_work(struct work_struct *work)
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
- wait_for_completion(&ctx->completions[0]);
+ /*
+ * If we're doing polled IO and end up having requests being
+ * submitted async (out-of-line), then completions can come in while
+ * we're waiting for refs to drop. We need to reap these manually,
+ * as nobody else will be looking for them.
+ */
+ while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) {
+ io_iopoll_reap_events(ctx);
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true);
+ }
io_ring_ctx_free(ctx);
}
@@ -7291,16 +7383,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
percpu_ref_kill(&ctx->refs);
mutex_unlock(&ctx->uring_lock);
- /*
- * Wait for sq thread to idle, if we have one. It won't spin on new
- * work after we've killed the ctx ref above. This is important to do
- * before we cancel existing commands, as the thread could otherwise
- * be queueing new work post that. If that's work we need to cancel,
- * it could cause shutdown to hang.
- */
- while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
- cpu_relax();
-
io_kill_timeouts(ctx);
io_poll_remove_all(ctx);
@@ -7325,14 +7407,25 @@ static int io_uring_release(struct inode *inode, struct file *file)
return 0;
}
+static bool io_wq_files_match(struct io_wq_work *work, void *data)
+{
+ struct files_struct *files = data;
+
+ return work->files == files;
+}
+
static void io_uring_cancel_files(struct io_ring_ctx *ctx,
struct files_struct *files)
{
- struct io_kiocb *req;
- DEFINE_WAIT(wait);
+ if (list_empty_careful(&ctx->inflight_list))
+ return;
+
+ /* cancel all at once, should be faster than doing it one by one*/
+ io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
while (!list_empty_careful(&ctx->inflight_list)) {
- struct io_kiocb *cancel_req = NULL;
+ struct io_kiocb *cancel_req = NULL, *req;
+ DEFINE_WAIT(wait);
spin_lock_irq(&ctx->inflight_lock);
list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
@@ -7371,16 +7464,26 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
* all we had, then we're done with this request.
*/
if (refcount_sub_and_test(2, &cancel_req->refs)) {
- io_put_req(cancel_req);
+ io_free_req(cancel_req);
+ finish_wait(&ctx->inflight_wait, &wait);
continue;
}
+ } else {
+ io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
+ io_put_req(cancel_req);
}
- io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
- io_put_req(cancel_req);
schedule();
+ finish_wait(&ctx->inflight_wait, &wait);
}
- finish_wait(&ctx->inflight_wait, &wait);
+}
+
+static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct task_struct *task = data;
+
+ return req->task == task;
}
static int io_uring_flush(struct file *file, void *data)
@@ -7393,7 +7496,7 @@ static int io_uring_flush(struct file *file, void *data)
* If the task is going away, cancel work it may have pending
*/
if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
- io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
+ io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
return 0;
}
@@ -7509,13 +7612,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
wake_up(&ctx->sqo_wait);
submitted = to_submit;
} else if (to_submit) {
- struct mm_struct *cur_mm;
-
mutex_lock(&ctx->uring_lock);
- /* already have mm, so io_submit_sqes() won't try to grab it */
- cur_mm = ctx->sqo_mm;
- submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
- &cur_mm, false);
+ submitted = io_submit_sqes(ctx, to_submit, f.file, fd);
mutex_unlock(&ctx->uring_lock);
if (submitted != to_submit)
@@ -7734,7 +7832,8 @@ err:
return ret;
}
-static int io_uring_create(unsigned entries, struct io_uring_params *p)
+static int io_uring_create(unsigned entries, struct io_uring_params *p,
+ struct io_uring_params __user *params)
{
struct user_struct *user = NULL;
struct io_ring_ctx *ctx;
@@ -7825,7 +7924,16 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
p->cq_off.cqes = offsetof(struct io_rings, cqes);
+ p->cq_off.flags = offsetof(struct io_rings, cq_flags);
+ p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
+ IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
+ IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
+
+ if (copy_to_user(params, p, sizeof(*p))) {
+ ret = -EFAULT;
+ goto err;
+ }
/*
* Install ring fd as the very last thing, so we don't risk someone
* having closed it before we finish setup
@@ -7834,9 +7942,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
if (ret < 0)
goto err;
- p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
- IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
- IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
@@ -7852,7 +7957,6 @@ err:
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
struct io_uring_params p;
- long ret;
int i;
if (copy_from_user(&p, params, sizeof(p)))
@@ -7867,14 +7971,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
return -EINVAL;
- ret = io_uring_create(entries, &p);
- if (ret < 0)
- return ret;
-
- if (copy_to_user(params, &p, sizeof(p)))
- return -EFAULT;
-
- return ret;
+ return io_uring_create(entries, &p, params);
}
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
@@ -7988,7 +8085,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
* after we've killed the percpu ref.
*/
mutex_unlock(&ctx->uring_lock);
- ret = wait_for_completion_interruptible(&ctx->completions[0]);
+ ret = wait_for_completion_interruptible(&ctx->ref_comp);
mutex_lock(&ctx->uring_lock);
if (ret) {
percpu_ref_resurrect(&ctx->refs);
@@ -8065,7 +8162,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
/* bring the ctx back to life */
percpu_ref_reinit(&ctx->refs);
out:
- reinit_completion(&ctx->completions[0]);
+ reinit_completion(&ctx->ref_comp);
}
return ret;
}