summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/block_dev.c3
-rw-r--r--fs/btrfs/file-item.c15
-rw-r--r--fs/btrfs/inode.c2
-rw-r--r--fs/btrfs/ordered-data.c3
-rw-r--r--fs/ceph/dir.c6
-rw-r--r--fs/ceph/inode.c16
-rw-r--r--fs/ceph/mds_client.c70
-rw-r--r--fs/ceph/snap.c7
-rw-r--r--fs/cifs/file.c15
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/misc.c23
-rw-r--r--fs/cifs/smb2pdu.c1
-rw-r--r--fs/io_uring.c291
-rw-r--r--fs/notify/fanotify/fanotify.c14
-rw-r--r--fs/notify/mark.c12
-rw-r--r--fs/proc/proc_sysctl.c6
-rw-r--r--fs/splice.c4
17 files changed, 337 insertions, 155 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 24615c76c1d0..bb28e2ead679 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -264,7 +264,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
bio_for_each_segment_all(bvec, &bio, i, iter_all) {
if (should_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
- put_page(bvec->bv_page);
+ if (!bio_flagged(&bio, BIO_NO_PAGE_REF))
+ put_page(bvec->bv_page);
}
if (unlikely(bio.bi_status))
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 920bf3b4b0ef..cccc75d15970 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -427,9 +428,13 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
unsigned long this_sum_bytes = 0;
int i;
u64 offset;
+ unsigned nofs_flag;
+
+ nofs_flag = memalloc_nofs_save();
+ sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
+ GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
- sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
- GFP_NOFS);
if (!sums)
return BLK_STS_RESOURCE;
@@ -472,8 +477,10 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
bytes_left = bio->bi_iter.bi_size - total_bytes;
- sums = kzalloc(btrfs_ordered_sum_size(fs_info, bytes_left),
- GFP_NOFS);
+ nofs_flag = memalloc_nofs_save();
+ sums = kvzalloc(btrfs_ordered_sum_size(fs_info,
+ bytes_left), GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
BUG_ON(!sums); /* -ENOMEM */
sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 82fdda8ff5ab..2973608824ec 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6783,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
- u8 extent_type;
+ int extent_type = -1;
struct btrfs_path *path = NULL;
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *item;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6fde2b2741ef..45e3cfd1198b 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
@@ -442,7 +443,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
list_del(&sum->list);
- kfree(sum);
+ kvfree(sum);
}
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a8f429882249..0637149fb9f9 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1766,6 +1766,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
{
struct ceph_inode_info *dci = ceph_inode(dir);
+ unsigned hash;
switch (dci->i_dir_layout.dl_dir_hash) {
case 0: /* for backward compat */
@@ -1773,8 +1774,11 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
return dn->d_name.hash;
default:
- return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
+ spin_lock(&dn->d_lock);
+ hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
dn->d_name.name, dn->d_name.len);
+ spin_unlock(&dn->d_lock);
+ return hash;
}
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 2d61ddda9bf5..c2feb310ac1e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1163,6 +1163,19 @@ static int splice_dentry(struct dentry **pdn, struct inode *in)
return 0;
}
+static int d_name_cmp(struct dentry *dentry, const char *name, size_t len)
+{
+ int ret;
+
+ /* take d_lock to ensure dentry->d_name stability */
+ spin_lock(&dentry->d_lock);
+ ret = dentry->d_name.len - len;
+ if (!ret)
+ ret = memcmp(dentry->d_name.name, name, len);
+ spin_unlock(&dentry->d_lock);
+ return ret;
+}
+
/*
* Incorporate results into the local cache. This is either just
* one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
@@ -1412,7 +1425,8 @@ retry_lookup:
err = splice_dentry(&req->r_dentry, in);
if (err < 0)
goto done;
- } else if (rinfo->head->is_dentry) {
+ } else if (rinfo->head->is_dentry &&
+ !d_name_cmp(req->r_dentry, rinfo->dname, rinfo->dname_len)) {
struct ceph_vino *ptvino = NULL;
if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) ||
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 21c33ed048ed..9049c2a3e972 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1414,6 +1414,15 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
ci->i_prealloc_cap_flush = NULL;
}
+
+ if (drop &&
+ ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
}
spin_unlock(&ci->i_ceph_lock);
while (!list_empty(&to_remove)) {
@@ -2161,10 +2170,39 @@ retry:
return path;
}
+/* Duplicate the dentry->d_name.name safely */
+static int clone_dentry_name(struct dentry *dentry, const char **ppath,
+ int *ppathlen)
+{
+ u32 len;
+ char *name;
+
+retry:
+ len = READ_ONCE(dentry->d_name.len);
+ name = kmalloc(len + 1, GFP_NOFS);
+ if (!name)
+ return -ENOMEM;
+
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_name.len != len) {
+ spin_unlock(&dentry->d_lock);
+ kfree(name);
+ goto retry;
+ }
+ memcpy(name, dentry->d_name.name, len);
+ spin_unlock(&dentry->d_lock);
+
+ name[len] = '\0';
+ *ppath = name;
+ *ppathlen = len;
+ return 0;
+}
+
static int build_dentry_path(struct dentry *dentry, struct inode *dir,
const char **ppath, int *ppathlen, u64 *pino,
- int *pfreepath)
+ bool *pfreepath, bool parent_locked)
{
+ int ret;
char *path;
rcu_read_lock();
@@ -2173,8 +2211,15 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
*pino = ceph_ino(dir);
rcu_read_unlock();
- *ppath = dentry->d_name.name;
- *ppathlen = dentry->d_name.len;
+ if (parent_locked) {
+ *ppath = dentry->d_name.name;
+ *ppathlen = dentry->d_name.len;
+ } else {
+ ret = clone_dentry_name(dentry, ppath, ppathlen);
+ if (ret)
+ return ret;
+ *pfreepath = true;
+ }
return 0;
}
rcu_read_unlock();
@@ -2182,13 +2227,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
if (IS_ERR(path))
return PTR_ERR(path);
*ppath = path;
- *pfreepath = 1;
+ *pfreepath = true;
return 0;
}
static int build_inode_path(struct inode *inode,
const char **ppath, int *ppathlen, u64 *pino,
- int *pfreepath)
+ bool *pfreepath)
{
struct dentry *dentry;
char *path;
@@ -2204,7 +2249,7 @@ static int build_inode_path(struct inode *inode,
if (IS_ERR(path))
return PTR_ERR(path);
*ppath = path;
- *pfreepath = 1;
+ *pfreepath = true;
return 0;
}
@@ -2215,7 +2260,7 @@ static int build_inode_path(struct inode *inode,
static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
struct inode *rdiri, const char *rpath,
u64 rino, const char **ppath, int *pathlen,
- u64 *ino, int *freepath)
+ u64 *ino, bool *freepath, bool parent_locked)
{
int r = 0;
@@ -2225,7 +2270,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
ceph_snap(rinode));
} else if (rdentry) {
r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
- freepath);
+ freepath, parent_locked);
dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
*ppath);
} else if (rpath || rino) {
@@ -2251,7 +2296,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
const char *path2 = NULL;
u64 ino1 = 0, ino2 = 0;
int pathlen1 = 0, pathlen2 = 0;
- int freepath1 = 0, freepath2 = 0;
+ bool freepath1 = false, freepath2 = false;
int len;
u16 releases;
void *p, *end;
@@ -2259,16 +2304,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
ret = set_request_path_attr(req->r_inode, req->r_dentry,
req->r_parent, req->r_path1, req->r_ino1.ino,
- &path1, &pathlen1, &ino1, &freepath1);
+ &path1, &pathlen1, &ino1, &freepath1,
+ test_bit(CEPH_MDS_R_PARENT_LOCKED,
+ &req->r_req_flags));
if (ret < 0) {
msg = ERR_PTR(ret);
goto out;
}
+ /* If r_old_dentry is set, then assume that its parent is locked */
ret = set_request_path_attr(NULL, req->r_old_dentry,
req->r_old_dentry_dir,
req->r_path2, req->r_ino2.ino,
- &path2, &pathlen2, &ino2, &freepath2);
+ &path2, &pathlen2, &ino2, &freepath2, true);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out_free1;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 89aa37fa0f84..b26e12cd8ec3 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -572,7 +572,12 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
old_snapc = NULL;
update_snapc:
- if (ci->i_head_snapc) {
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ci->i_head_snapc = NULL;
+ } else {
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
dout(" new snapc is %p\n", new_snapc);
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9c0ccc06d172..7037a137fa53 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2877,7 +2877,6 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
struct dentry *dentry = ctx->cfile->dentry;
- unsigned int i;
int rc;
tcon = tlink_tcon(ctx->cfile->tlink);
@@ -2941,10 +2940,6 @@ restart_loop:
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
- if (!ctx->direct_io)
- for (i = 0; i < ctx->npages; i++)
- put_page(ctx->bv[i].bv_page);
-
cifs_stats_bytes_written(tcon, ctx->total_len);
set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags);
@@ -3582,7 +3577,6 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx)
struct iov_iter *to = &ctx->iter;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
- unsigned int i;
int rc;
tcon = tlink_tcon(ctx->cfile->tlink);
@@ -3666,15 +3660,8 @@ again:
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
- if (!ctx->direct_io) {
- for (i = 0; i < ctx->npages; i++) {
- if (ctx->should_dirty)
- set_page_dirty(ctx->bv[i].bv_page);
- put_page(ctx->bv[i].bv_page);
- }
-
+ if (!ctx->direct_io)
ctx->total_len = ctx->len - iov_iter_count(to);
- }
/* mask nodata case */
if (rc == -ENODATA)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 53fdb5df0d2e..538fd7d807e4 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1735,6 +1735,10 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
if (rc == 0 || rc != -EBUSY)
goto do_rename_exit;
+ /* Don't fall back to using SMB on SMB 2+ mount */
+ if (server->vals->protocol_id != 0)
+ goto do_rename_exit;
+
/* open-file renames don't work across directories */
if (to_dentry->d_parent != from_dentry->d_parent)
goto do_rename_exit;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1e1626a2cfc3..0dc6f08020ac 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -789,6 +789,11 @@ cifs_aio_ctx_alloc(void)
{
struct cifs_aio_ctx *ctx;
+ /*
+ * Must use kzalloc to initialize ctx->bv to NULL and ctx->direct_io
+ * to false so that we know when we have to unreference pages within
+ * cifs_aio_ctx_release()
+ */
ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL);
if (!ctx)
return NULL;
@@ -807,7 +812,23 @@ cifs_aio_ctx_release(struct kref *refcount)
struct cifs_aio_ctx, refcount);
cifsFileInfo_put(ctx->cfile);
- kvfree(ctx->bv);
+
+ /*
+ * ctx->bv is only set if setup_aio_ctx_iter() was call successfuly
+ * which means that iov_iter_get_pages() was a success and thus that
+ * we have taken reference on pages.
+ */
+ if (ctx->bv) {
+ unsigned i;
+
+ for (i = 0; i < ctx->npages; i++) {
+ if (ctx->should_dirty)
+ set_page_dirty(ctx->bv[i].bv_page);
+ put_page(ctx->bv[i].bv_page);
+ }
+ kvfree(ctx->bv);
+ }
+
kfree(ctx);
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index b8f7262ac354..a37774a55f3a 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -3466,6 +3466,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
io_parms->tcon->tid, ses->Suid,
io_parms->offset, 0);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+ cifs_small_buf_release(req);
return rc == -ENODATA ? 0 : rc;
} else
trace_smb3_read_done(xid, req->PersistentFileId,
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f65f85d89217..84efb8956734 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4,15 +4,28 @@
* supporting fast/efficient IO.
*
* A note on the read/write ordering memory barriers that are matched between
- * the application and kernel side. When the application reads the CQ ring
- * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
- * the kernel uses after writing the tail. Failure to do so could cause a
- * delay in when the application notices that completion events available.
- * This isn't a fatal condition. Likewise, the application must use an
- * appropriate smp_wmb() both before writing the SQ tail, and after writing
- * the SQ tail. The first one orders the sqe writes with the tail write, and
- * the latter is paired with the smp_rmb() the kernel will issue before
- * reading the SQ tail on submission.
+ * the application and kernel side.
+ *
+ * After the application reads the CQ ring tail, it must use an
+ * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
+ * before writing the tail (using smp_load_acquire to read the tail will
+ * do). It also needs a smp_mb() before updating CQ head (ordering the
+ * entry load(s) with the head store), pairing with an implicit barrier
+ * through a control-dependency in io_get_cqring (smp_store_release to
+ * store head will do). Failure to do so could lead to reading invalid
+ * CQ entries.
+ *
+ * Likewise, the application must use an appropriate smp_wmb() before
+ * writing the SQ tail (ordering SQ entry stores with the tail store),
+ * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
+ * to store the tail will do). And it needs a barrier ordering the SQ
+ * head load before writing new SQ entries (smp_load_acquire to read
+ * head will do).
+ *
+ * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
+ * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
+ * updating the SQ tail; a full memory barrier smp_mb() is needed
+ * between.
*
* Also see the examples in the liburing library:
*
@@ -70,20 +83,108 @@ struct io_uring {
u32 tail ____cacheline_aligned_in_smp;
};
+/*
+ * This data is shared with the application through the mmap at offset
+ * IORING_OFF_SQ_RING.
+ *
+ * The offsets to the member fields are published through struct
+ * io_sqring_offsets when calling io_uring_setup.
+ */
struct io_sq_ring {
+ /*
+ * Head and tail offsets into the ring; the offsets need to be
+ * masked to get valid indices.
+ *
+ * The kernel controls head and the application controls tail.
+ */
struct io_uring r;
+ /*
+ * Bitmask to apply to head and tail offsets (constant, equals
+ * ring_entries - 1)
+ */
u32 ring_mask;
+ /* Ring size (constant, power of 2) */
u32 ring_entries;
+ /*
+ * Number of invalid entries dropped by the kernel due to
+ * invalid index stored in array
+ *
+ * Written by the kernel, shouldn't be modified by the
+ * application (i.e. get number of "new events" by comparing to
+ * cached value).
+ *
+ * After a new SQ head value was read by the application this
+ * counter includes all submissions that were dropped reaching
+ * the new SQ head (and possibly more).
+ */
u32 dropped;
+ /*
+ * Runtime flags
+ *
+ * Written by the kernel, shouldn't be modified by the
+ * application.
+ *
+ * The application needs a full memory barrier before checking
+ * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
+ */
u32 flags;
+ /*
+ * Ring buffer of indices into array of io_uring_sqe, which is
+ * mmapped by the application using the IORING_OFF_SQES offset.
+ *
+ * This indirection could e.g. be used to assign fixed
+ * io_uring_sqe entries to operations and only submit them to
+ * the queue when needed.
+ *
+ * The kernel modifies neither the indices array nor the entries
+ * array.
+ */
u32 array[];
};
+/*
+ * This data is shared with the application through the mmap at offset
+ * IORING_OFF_CQ_RING.
+ *
+ * The offsets to the member fields are published through struct
+ * io_cqring_offsets when calling io_uring_setup.
+ */
struct io_cq_ring {
+ /*
+ * Head and tail offsets into the ring; the offsets need to be
+ * masked to get valid indices.
+ *
+ * The application controls head and the kernel tail.
+ */
struct io_uring r;
+ /*
+ * Bitmask to apply to head and tail offsets (constant, equals
+ * ring_entries - 1)
+ */
u32 ring_mask;
+ /* Ring size (constant, power of 2) */
u32 ring_entries;
+ /*
+ * Number of completion events lost because the queue was full;
+ * this should be avoided by the application by making sure
+ * there are not more requests pending thatn there is space in
+ * the completion queue.
+ *
+ * Written by the kernel, shouldn't be modified by the
+ * application (i.e. get number of "new events" by comparing to
+ * cached value).
+ *
+ * As completion events come in out of order this counter is not
+ * ordered with any other data.
+ */
u32 overflow;
+ /*
+ * Ring buffer of completion events.
+ *
+ * The kernel writes completion events fresh every time they are
+ * produced, so the application is allowed to modify pending
+ * entries.
+ */
struct io_uring_cqe cqes[];
};
@@ -221,7 +322,7 @@ struct io_kiocb {
struct list_head list;
unsigned int flags;
refcount_t refs;
-#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */
+#define REQ_F_NOWAIT 1 /* must not punt to workers */
#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
#define REQ_F_FIXED_FILE 4 /* ctx owns file */
#define REQ_F_SEQ_PREV 8 /* sequential with previous */
@@ -317,12 +418,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
/* order cqe stores with ring update */
smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
- /*
- * Write sider barrier of tail update, app has read side. See
- * comment at the top of this file.
- */
- smp_wmb();
-
if (wq_has_sleeper(&ctx->cq_wait)) {
wake_up_interruptible(&ctx->cq_wait);
kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
@@ -336,8 +431,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
unsigned tail;
tail = ctx->cached_cq_tail;
- /* See comment at the top of the file */
- smp_rmb();
+ /*
+ * writes to the cq entry need to come after reading head; the
+ * control dependency is enough as we're using WRITE_ONCE to
+ * fill the cq entry
+ */
if (tail - READ_ONCE(ring->r.head) == ring->ring_entries)
return NULL;
@@ -740,7 +838,7 @@ static bool io_file_supports_async(struct file *file)
}
static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
- bool force_nonblock, struct io_submit_state *state)
+ bool force_nonblock)
{
const struct io_uring_sqe *sqe = s->sqe;
struct io_ring_ctx *ctx = req->ctx;
@@ -774,10 +872,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
if (unlikely(ret))
return ret;
- if (force_nonblock) {
+
+ /* don't allow async punt if RWF_NOWAIT was requested */
+ if (kiocb->ki_flags & IOCB_NOWAIT)
+ req->flags |= REQ_F_NOWAIT;
+
+ if (force_nonblock)
kiocb->ki_flags |= IOCB_NOWAIT;
- req->flags |= REQ_F_FORCE_NONBLOCK;
- }
+
if (ctx->flags & IORING_SETUP_IOPOLL) {
if (!(kiocb->ki_flags & IOCB_DIRECT) ||
!kiocb->ki_filp->f_op->iopoll)
@@ -938,7 +1040,7 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
}
static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
- bool force_nonblock, struct io_submit_state *state)
+ bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw;
@@ -947,7 +1049,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
size_t iov_count;
int ret;
- ret = io_prep_rw(req, s, force_nonblock, state);
+ ret = io_prep_rw(req, s, force_nonblock);
if (ret)
return ret;
file = kiocb->ki_filp;
@@ -985,7 +1087,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
}
static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
- bool force_nonblock, struct io_submit_state *state)
+ bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw;
@@ -994,7 +1096,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
size_t iov_count;
int ret;
- ret = io_prep_rw(req, s, force_nonblock, state);
+ ret = io_prep_rw(req, s, force_nonblock);
if (ret)
return ret;
@@ -1336,8 +1438,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct sqe_submit *s, bool force_nonblock,
- struct io_submit_state *state)
+ const struct sqe_submit *s, bool force_nonblock)
{
int ret, opcode;
@@ -1353,18 +1454,18 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
case IORING_OP_READV:
if (unlikely(s->sqe->buf_index))
return -EINVAL;
- ret = io_read(req, s, force_nonblock, state);
+ ret = io_read(req, s, force_nonblock);
break;
case IORING_OP_WRITEV:
if (unlikely(s->sqe->buf_index))
return -EINVAL;
- ret = io_write(req, s, force_nonblock, state);
+ ret = io_write(req, s, force_nonblock);
break;
case IORING_OP_READ_FIXED:
- ret = io_read(req, s, force_nonblock, state);
+ ret = io_read(req, s, force_nonblock);
break;
case IORING_OP_WRITE_FIXED:
- ret = io_write(req, s, force_nonblock, state);
+ ret = io_write(req, s, force_nonblock);
break;
case IORING_OP_FSYNC:
ret = io_fsync(req, s->sqe, force_nonblock);
@@ -1437,8 +1538,7 @@ restart:
struct sqe_submit *s = &req->submit;
const struct io_uring_sqe *sqe = s->sqe;
- /* Ensure we clear previously set forced non-block flag */
- req->flags &= ~REQ_F_FORCE_NONBLOCK;
+ /* Ensure we clear previously set non-block flag */
req->rw.ki_flags &= ~IOCB_NOWAIT;
ret = 0;
@@ -1457,7 +1557,7 @@ restart:
s->has_user = cur_mm != NULL;
s->needs_lock = true;
do {
- ret = __io_submit_sqe(ctx, req, s, false, NULL);
+ ret = __io_submit_sqe(ctx, req, s, false);
/*
* We can get EAGAIN for polled IO even though
* we're forcing a sync submission from here,
@@ -1468,10 +1568,11 @@ restart:
break;
cond_resched();
} while (1);
-
- /* drop submission reference */
- io_put_req(req);
}
+
+ /* drop submission reference */
+ io_put_req(req);
+
if (ret) {
io_cqring_add_event(ctx, sqe->user_data, ret, 0);
io_put_req(req);
@@ -1623,8 +1724,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
if (unlikely(ret))
goto out;
- ret = __io_submit_sqe(ctx, req, s, true, state);
- if (ret == -EAGAIN) {
+ ret = __io_submit_sqe(ctx, req, s, true);
+ if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
struct io_uring_sqe *sqe_copy;
sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
@@ -1698,24 +1799,10 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
* write new data to them.
*/
smp_store_release(&ring->r.head, ctx->cached_sq_head);
-
- /*
- * write side barrier of head update, app has read side. See
- * comment at the top of this file
- */
- smp_wmb();
}
}
/*
- * Undo last io_get_sqring()
- */
-static void io_drop_sqring(struct io_ring_ctx *ctx)
-{
- ctx->cached_sq_head--;
-}
-
-/*
* Fetch an sqe, if one is available. Note that s->sqe will point to memory
* that is mapped by userspace. This means that care needs to be taken to
* ensure that reads are stable, as we cannot rely on userspace always
@@ -1737,9 +1824,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
* though the application is the one updating it.
*/
head = ctx->cached_sq_head;
- /* See comment at the top of this file */
- smp_rmb();
- if (head == READ_ONCE(ring->r.tail))
+ /* make sure SQ entry isn't read before tail */
+ if (head == smp_load_acquire(&ring->r.tail))
return false;
head = READ_ONCE(ring->array[head & ctx->sq_mask]);
@@ -1753,8 +1839,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
/* drop invalid entries */
ctx->cached_sq_head++;
ring->dropped++;
- /* See comment at the top of this file */
- smp_wmb();
return false;
}
@@ -1864,7 +1948,8 @@ static int io_sq_thread(void *data)
/* Tell userspace we may need a wakeup call */
ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
- smp_wmb();
+ /* make sure to read SQ tail after writing flags */
+ smp_mb();
if (!io_get_sqring(ctx, &sqes[0])) {
if (kthread_should_stop()) {
@@ -1877,13 +1962,11 @@ static int io_sq_thread(void *data)
finish_wait(&ctx->sqo_wait, &wait);
ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
- smp_wmb();
continue;
}
finish_wait(&ctx->sqo_wait, &wait);
ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
- smp_wmb();
}
i = 0;
@@ -1928,7 +2011,7 @@ static int io_sq_thread(void *data)
static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
{
struct io_submit_state state, *statep = NULL;
- int i, ret = 0, submit = 0;
+ int i, submit = 0;
if (to_submit > IO_PLUG_THRESHOLD) {
io_submit_state_start(&state, ctx, to_submit);
@@ -1937,6 +2020,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
for (i = 0; i < to_submit; i++) {
struct sqe_submit s;
+ int ret;
if (!io_get_sqring(ctx, &s))
break;
@@ -1944,21 +2028,18 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
s.has_user = true;
s.needs_lock = false;
s.needs_fixed_file = false;
+ submit++;
ret = io_submit_sqe(ctx, &s, statep);
- if (ret) {
- io_drop_sqring(ctx);
- break;
- }
-
- submit++;
+ if (ret)
+ io_cqring_add_event(ctx, s.sqe->user_data, ret, 0);
}
io_commit_sqring(ctx);
if (statep)
io_submit_state_end(statep);
- return submit ? submit : ret;
+ return submit;
}
static unsigned io_cqring_events(struct io_cq_ring *ring)
@@ -2239,10 +2320,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
mmgrab(current->mm);
ctx->sqo_mm = current->mm;
- ret = -EINVAL;
- if (!cpu_possible(p->sq_thread_cpu))
- goto err;
-
if (ctx->flags & IORING_SETUP_SQPOLL) {
ret = -EPERM;
if (!capable(CAP_SYS_ADMIN))
@@ -2253,11 +2330,11 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
ctx->sq_thread_idle = HZ;
if (p->flags & IORING_SETUP_SQ_AFF) {
- int cpu;
+ int cpu = array_index_nospec(p->sq_thread_cpu,
+ nr_cpu_ids);
- cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
ret = -EINVAL;
- if (!cpu_possible(p->sq_thread_cpu))
+ if (!cpu_possible(cpu))
goto err;
ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
@@ -2320,8 +2397,12 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
static void io_mem_free(void *ptr)
{
- struct page *page = virt_to_head_page(ptr);
+ struct page *page;
+
+ if (!ptr)
+ return;
+ page = virt_to_head_page(ptr);
if (put_page_testzero(page))
free_compound_page(page);
}
@@ -2362,7 +2443,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
if (ctx->account_mem)
io_unaccount_mem(ctx->user, imu->nr_bvecs);
- kfree(imu->bvec);
+ kvfree(imu->bvec);
imu->nr_bvecs = 0;
}
@@ -2454,9 +2535,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
if (!pages || nr_pages > got_pages) {
kfree(vmas);
kfree(pages);
- pages = kmalloc_array(nr_pages, sizeof(struct page *),
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *),
GFP_KERNEL);
- vmas = kmalloc_array(nr_pages,
+ vmas = kvmalloc_array(nr_pages,
sizeof(struct vm_area_struct *),
GFP_KERNEL);
if (!pages || !vmas) {
@@ -2468,7 +2549,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
got_pages = nr_pages;
}
- imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
+ imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
GFP_KERNEL);
ret = -ENOMEM;
if (!imu->bvec) {
@@ -2507,6 +2588,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
}
if (ctx->account_mem)
io_unaccount_mem(ctx->user, nr_pages);
+ kvfree(imu->bvec);
goto err;
}
@@ -2529,12 +2611,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
ctx->nr_user_bufs++;
}
- kfree(pages);
- kfree(vmas);
+ kvfree(pages);
+ kvfree(vmas);
return 0;
err:
- kfree(pages);
- kfree(vmas);
+ kvfree(pages);
+ kvfree(vmas);
io_sqe_buffer_unregister(ctx);
return ret;
}
@@ -2572,9 +2654,13 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
__poll_t mask = 0;
poll_wait(file, &ctx->cq_wait, wait);
- /* See comment at the top of this file */
+ /*
+ * synchronizes with barrier from wq_has_sleeper call in
+ * io_commit_cqring
+ */
smp_rmb();
- if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
+ if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head !=
+ ctx->sq_ring->ring_entries)
mask |= EPOLLOUT | EPOLLWRNORM;
if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
mask |= EPOLLIN | EPOLLRDNORM;
@@ -2685,24 +2771,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
mutex_lock(&ctx->uring_lock);
submitted = io_ring_submit(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
-
- if (submitted < 0)
- goto out_ctx;
}
if (flags & IORING_ENTER_GETEVENTS) {
unsigned nr_events = 0;
min_complete = min(min_complete, ctx->cq_entries);
- /*
- * The application could have included the 'to_submit' count
- * in how many events it wanted to wait for. If we failed to
- * submit the desired count, we may need to adjust the number
- * of events to poll/wait for.
- */
- if (submitted < to_submit)
- min_complete = min_t(unsigned, submitted, min_complete);
-
if (ctx->flags & IORING_SETUP_IOPOLL) {
mutex_lock(&ctx->uring_lock);
ret = io_iopoll_check(ctx, &nr_events, min_complete);
@@ -2748,17 +2822,12 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
return -EOVERFLOW;
ctx->sq_sqes = io_mem_alloc(size);
- if (!ctx->sq_sqes) {
- io_mem_free(ctx->sq_ring);
+ if (!ctx->sq_sqes)
return -ENOMEM;
- }
cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
- if (!cq_ring) {
- io_mem_free(ctx->sq_ring);
- io_mem_free(ctx->sq_sqes);
+ if (!cq_ring)
return -ENOMEM;
- }
ctx->cq_ring = cq_ring;
cq_ring->ring_mask = p->cq_entries - 1;
@@ -2934,6 +3003,14 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
{
int ret;
+ /*
+ * We're inside the ring mutex, if the ref is already dying, then
+ * someone else killed the ctx or is already going through
+ * io_uring_register().
+ */
+ if (percpu_ref_is_dying(&ctx->refs))
+ return -ENXIO;
+
percpu_ref_kill(&ctx->refs);
/*
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 6b9c27548997..63c6bb1f8c4d 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -346,10 +346,16 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
__kernel_fsid_t fsid = {};
fsnotify_foreach_obj_type(type) {
+ struct fsnotify_mark_connector *conn;
+
if (!fsnotify_iter_should_report_type(iter_info, type))
continue;
- fsid = iter_info->marks[type]->connector->fsid;
+ conn = READ_ONCE(iter_info->marks[type]->connector);
+ /* Mark is just getting destroyed or created? */
+ if (!conn)
+ continue;
+ fsid = conn->fsid;
if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
continue;
return fsid;
@@ -408,8 +414,12 @@ static int fanotify_handle_event(struct fsnotify_group *group,
return 0;
}
- if (FAN_GROUP_FLAG(group, FAN_REPORT_FID))
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
fsid = fanotify_get_fsid(iter_info);
+ /* Racing with mark destruction or creation? */
+ if (!fsid.val[0] && !fsid.val[1])
+ return 0;
+ }
event = fanotify_alloc_event(group, inode, mask, data, data_type,
&fsid);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d593d4269561..22acb0a79b53 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -239,13 +239,13 @@ static void fsnotify_drop_object(unsigned int type, void *objp)
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
- struct fsnotify_mark_connector *conn;
+ struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector);
void *objp = NULL;
unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED;
bool free_conn = false;
/* Catch marks that were actually never attached to object */
- if (!mark->connector) {
+ if (!conn) {
if (refcount_dec_and_test(&mark->refcnt))
fsnotify_final_mark_destroy(mark);
return;
@@ -255,10 +255,9 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
* We have to be careful so that traversals of obj_list under lock can
* safely grab mark reference.
*/
- if (!refcount_dec_and_lock(&mark->refcnt, &mark->connector->lock))
+ if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock))
return;
- conn = mark->connector;
hlist_del_init_rcu(&mark->obj_list);
if (hlist_empty(&conn->list)) {
objp = fsnotify_detach_connector_from_object(conn, &type);
@@ -266,7 +265,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
} else {
__fsnotify_recalc_mask(conn);
}
- mark->connector = NULL;
+ WRITE_ONCE(mark->connector, NULL);
spin_unlock(&conn->lock);
fsnotify_drop_object(type, objp);
@@ -620,7 +619,7 @@ restart:
/* mark should be the last entry. last is the current last entry */
hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
added:
- mark->connector = conn;
+ WRITE_ONCE(mark->connector, conn);
out_err:
spin_unlock(&conn->lock);
spin_unlock(&mark->lock);
@@ -808,6 +807,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
refcount_set(&mark->refcnt, 1);
fsnotify_get_group(group);
mark->group = group;
+ WRITE_ONCE(mark->connector, NULL);
}
/*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 2d61e5e8c863..c74570736b24 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1643,9 +1643,11 @@ static void drop_sysctl_table(struct ctl_table_header *header)
if (--header->nreg)
return;
- if (parent)
+ if (parent) {
put_links(header);
- start_unregistering(header);
+ start_unregistering(header);
+ }
+
if (!--header->count)
kfree_rcu(header, rcu);
diff --git a/fs/splice.c b/fs/splice.c
index 98943d9b219c..25212dcca2df 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -330,8 +330,8 @@ const struct pipe_buf_operations default_pipe_buf_ops = {
.get = generic_pipe_buf_get,
};
-static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
+int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
{
return 1;
}