summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJiri Kosina <jkosina@suse.cz>2021-04-29 21:45:19 +0200
committerJiri Kosina <jkosina@suse.cz>2021-04-29 21:45:19 +0200
commite50fedec822efc7b7090f95862b782d91ca8aec0 (patch)
tree22dd6b534d00a1f7fc3823f0d88e0d43547fc87b /fs
parentcfc9bdfb6ba76de84a9ed8ee75dc56903b505a78 (diff)
parent35a927f2848bd79586c6374ebb99e4207f3b0c7f (diff)
downloadlinux-e50fedec822efc7b7090f95862b782d91ca8aec0.tar.bz2
Merge branch 'for-5.13/surface-system-aggregator-intergration' into for-linus
- Surface Aggregator Module support from Maximilian Luz
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c21
-rw-r--r--fs/block_dev.c12
-rw-r--r--fs/btrfs/block-group.c33
-rw-r--r--fs/btrfs/block-group.h9
-rw-r--r--fs/btrfs/compression.c68
-rw-r--r--fs/btrfs/ctree.h5
-rw-r--r--fs/btrfs/delayed-inode.c5
-rw-r--r--fs/btrfs/extent_io.c21
-rw-r--r--fs/btrfs/file.c7
-rw-r--r--fs/btrfs/free-space-cache.c21
-rw-r--r--fs/btrfs/inode.c46
-rw-r--r--fs/btrfs/ioctl.c19
-rw-r--r--fs/btrfs/lzo.c4
-rw-r--r--fs/btrfs/qgroup.c8
-rw-r--r--fs/btrfs/qgroup.h2
-rw-r--r--fs/btrfs/raid56.c31
-rw-r--r--fs/btrfs/ref-verify.c4
-rw-r--r--fs/btrfs/reflink.c24
-rw-r--r--fs/btrfs/scrub.c9
-rw-r--r--fs/btrfs/send.c7
-rw-r--r--fs/btrfs/super.c4
-rw-r--r--fs/btrfs/tree-checker.c16
-rw-r--r--fs/btrfs/tree-log.c3
-rw-r--r--fs/btrfs/xattr.c31
-rw-r--r--fs/btrfs/zlib.c5
-rw-r--r--fs/btrfs/zoned.c4
-rw-r--r--fs/btrfs/zstd.c6
-rw-r--r--fs/buffer.c7
-rw-r--r--fs/cifs/cifs_debug.c121
-rw-r--r--fs/cifs/cifs_swn.c2
-rw-r--r--fs/cifs/cifsacl.c379
-rw-r--r--fs/cifs/cifsacl.h4
-rw-r--r--fs/cifs/cifsencrypt.c6
-rw-r--r--fs/cifs/cifsfs.c15
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h11
-rw-r--r--fs/cifs/cifsproto.h2
-rw-r--r--fs/cifs/cifssmb.c6
-rw-r--r--fs/cifs/connect.c301
-rw-r--r--fs/cifs/dfs_cache.c33
-rw-r--r--fs/cifs/file.c2
-rw-r--r--fs/cifs/fs_context.c75
-rw-r--r--fs/cifs/fs_context.h6
-rw-r--r--fs/cifs/inode.c23
-rw-r--r--fs/cifs/sess.c2
-rw-r--r--fs/cifs/smb2ops.c109
-rw-r--r--fs/cifs/smb2pdu.c22
-rw-r--r--fs/cifs/trace.h36
-rw-r--r--fs/cifs/transport.c63
-rw-r--r--fs/coredump.c4
-rw-r--r--fs/dcache.c4
-rw-r--r--fs/debugfs/inode.c5
-rw-r--r--fs/direct-io.c6
-rw-r--r--fs/erofs/data.c4
-rw-r--r--fs/exec.c4
-rw-r--r--fs/ext4/.kunitconfig3
-rw-r--r--fs/ext4/Kconfig3
-rw-r--r--fs/ext4/extents.c16
-rw-r--r--fs/ext4/fast_commit.c29
-rw-r--r--fs/ext4/namei.c45
-rw-r--r--fs/ext4/readpage.c3
-rw-r--r--fs/ext4/super.c12
-rw-r--r--fs/f2fs/data.c3
-rw-r--r--fs/f2fs/node.c2
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/fuse/dev.c6
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/hugetlbfs/inode.c72
-rw-r--r--fs/inode.c1
-rw-r--r--fs/io-wq.c626
-rw-r--r--fs/io-wq.h34
-rw-r--r--fs/io_uring.c2003
-rw-r--r--fs/iomap/buffered-io.c11
-rw-r--r--fs/iomap/seek.c125
-rw-r--r--fs/jfs/super.c1
-rw-r--r--fs/mpage.c4
-rw-r--r--fs/namespace.c53
-rw-r--r--fs/nfs/blocklayout/blocklayout.c6
-rw-r--r--fs/nfs/file.c27
-rw-r--r--fs/nfs/fs_context.c35
-rw-r--r--fs/nfs/fscache.c4
-rw-r--r--fs/nfs/inode.c111
-rw-r--r--fs/nfs/nfs3acl.c1
-rw-r--r--fs/nfs/nfs4client.c1
-rw-r--r--fs/nfs/nfs4proc.c21
-rw-r--r--fs/nfs/nfs4state.c1
-rw-r--r--fs/nfs/pnfs.c2
-rw-r--r--fs/nfs/read.c204
-rw-r--r--fs/nfs/super.c7
-rw-r--r--fs/nfs/write.c37
-rw-r--r--fs/ntfs/inode.c6
-rw-r--r--fs/ntfs/layout.h4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c8
-rw-r--r--fs/ocfs2/dlm/dlmast.c10
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h4
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/proc/base.c19
-rw-r--r--fs/proc/meminfo.c10
-rw-r--r--fs/proc/proc_sysctl.c4
-rw-r--r--fs/proc/self.c7
-rw-r--r--fs/proc/thread_self.c7
-rw-r--r--fs/proc/vmcore.c7
-rw-r--r--fs/pstore/inode.c2
-rw-r--r--fs/pstore/ram_core.c2
-rw-r--r--fs/ramfs/inode.c13
-rw-r--r--fs/xfs/libxfs/xfs_btree.c12
-rw-r--r--fs/xfs/xfs_aops.c17
-rw-r--r--fs/xfs/xfs_bio_io.c2
-rw-r--r--fs/xfs/xfs_buf.c4
-rw-r--r--fs/xfs/xfs_extent_busy.c14
-rw-r--r--fs/xfs/xfs_sysctl.c35
-rw-r--r--fs/xfs/xfs_trans.c33
-rw-r--r--fs/xfs/xfs_trans.h30
115 files changed, 2771 insertions, 2679 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 648eb4c4cf7f..8d97f0b45e9c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1139,9 +1139,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
struct super_block *sb, unsigned int flags)
{
umode_t mode;
- char ext[32];
- char tag_name[14];
- unsigned int i_nlink;
struct v9fs_session_info *v9ses = sb->s_fs_info;
struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -1159,18 +1156,18 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
inode->i_gid = stat->n_gid;
}
if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) {
- if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) {
+ if (v9fs_proto_dotu(v9ses)) {
+ unsigned int i_nlink;
/*
- * Hadlink support got added later to
- * to the .u extension. So there can be
- * server out there that doesn't support
- * this even with .u extension. So check
- * for non NULL stat->extension
+ * Hadlink support got added later to the .u extension.
+ * So there can be a server out there that doesn't
+ * support this even with .u extension. That would
+ * just leave us with stat->extension being an empty
+ * string, though.
*/
- strlcpy(ext, stat->extension, sizeof(ext));
/* HARDLINKCOUNT %u */
- sscanf(ext, "%13s %u", tag_name, &i_nlink);
- if (!strncmp(tag_name, "HARDLINKCOUNT", 13))
+ if (sscanf(stat->extension,
+ " HARDLINKCOUNT %u", &i_nlink) == 1)
set_nlink(inode, i_nlink);
}
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ec26179c8062..4aa1f88d5bf8 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -221,7 +221,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
static ssize_t
__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
- int nr_pages)
+ unsigned int nr_pages)
{
struct file *file = iocb->ki_filp;
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
@@ -355,8 +355,8 @@ static void blkdev_bio_end_io(struct bio *bio)
}
}
-static ssize_t
-__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
+static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ unsigned int nr_pages)
{
struct file *file = iocb->ki_filp;
struct inode *inode = bdev_file_inode(file);
@@ -486,7 +486,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
static ssize_t
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
- int nr_pages;
+ unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
@@ -495,7 +495,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
- return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
+ return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
}
static __init int blkdev_init(void)
@@ -1270,7 +1270,7 @@ rescan:
return ret;
}
/*
- * Only exported for for loop and dasd for historic reasons. Don't use in new
+ * Only exported for loop and dasd for historic reasons. Don't use in new
* code!
*/
EXPORT_SYMBOL_GPL(bdev_disk_changed);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 5064be59dac5..744b99ddc28c 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1162,6 +1162,11 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
+ if (cache->swap_extents) {
+ ret = -ETXTBSY;
+ goto out;
+ }
+
if (cache->ro) {
cache->ro++;
ret = 0;
@@ -2307,7 +2312,7 @@ again:
}
ret = inc_block_group_ro(cache, 0);
- if (!do_chunk_alloc)
+ if (!do_chunk_alloc || ret == -ETXTBSY)
goto unlock_out;
if (!ret)
goto out;
@@ -2316,6 +2321,8 @@ again:
if (ret < 0)
goto out;
ret = inc_block_group_ro(cache, 0);
+ if (ret == -ETXTBSY)
+ goto unlock_out;
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
@@ -3406,6 +3413,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
ASSERT(list_empty(&block_group->io_list));
ASSERT(list_empty(&block_group->bg_list));
ASSERT(refcount_read(&block_group->refs) == 1);
+ ASSERT(block_group->swap_extents == 0);
btrfs_put_block_group(block_group);
spin_lock(&info->block_group_cache_lock);
@@ -3472,3 +3480,26 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
__btrfs_remove_free_space_cache(block_group->free_space_ctl);
}
}
+
+bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
+{
+ bool ret = true;
+
+ spin_lock(&bg->lock);
+ if (bg->ro)
+ ret = false;
+ else
+ bg->swap_extents++;
+ spin_unlock(&bg->lock);
+
+ return ret;
+}
+
+void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
+{
+ spin_lock(&bg->lock);
+ ASSERT(!bg->ro);
+ ASSERT(bg->swap_extents >= amount);
+ bg->swap_extents -= amount;
+ spin_unlock(&bg->lock);
+}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 29678426247d..3ecc3372a5ce 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -186,6 +186,12 @@ struct btrfs_block_group {
/* Flag indicating this block group is placed on a sequential zone */
bool seq_zone;
+ /*
+ * Number of extents in this block group used for swap files.
+ * All accesses protected by the spinlock 'lock'.
+ */
+ int swap_extents;
+
/* Record locked full stripes for RAID5/6 block group */
struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
@@ -312,4 +318,7 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
void btrfs_freeze_block_group(struct btrfs_block_group *cache);
void btrfs_unfreeze_block_group(struct btrfs_block_group *cache);
+bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg);
+void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount);
+
#endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 6d203acfdeb3..3f4c832abfed 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -141,6 +141,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
const u32 csum_size = fs_info->csum_size;
+ const u32 sectorsize = fs_info->sectorsize;
struct page *page;
unsigned long i;
char *kaddr;
@@ -154,22 +155,34 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
shash->tfm = fs_info->csum_shash;
for (i = 0; i < cb->nr_pages; i++) {
+ u32 pg_offset;
+ u32 bytes_left = PAGE_SIZE;
page = cb->compressed_pages[i];
- kaddr = kmap_atomic(page);
- crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
- kunmap_atomic(kaddr);
-
- if (memcmp(&csum, cb_sum, csum_size)) {
- btrfs_print_data_csum_error(inode, disk_start,
- csum, cb_sum, cb->mirror_num);
- if (btrfs_io_bio(bio)->device)
- btrfs_dev_stat_inc_and_print(
- btrfs_io_bio(bio)->device,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- return -EIO;
+ /* Determine the remaining bytes inside the page first */
+ if (i == cb->nr_pages - 1)
+ bytes_left = cb->compressed_len - i * PAGE_SIZE;
+
+ /* Hash through the page sector by sector */
+ for (pg_offset = 0; pg_offset < bytes_left;
+ pg_offset += sectorsize) {
+ kaddr = kmap_atomic(page);
+ crypto_shash_digest(shash, kaddr + pg_offset,
+ sectorsize, csum);
+ kunmap_atomic(kaddr);
+
+ if (memcmp(&csum, cb_sum, csum_size) != 0) {
+ btrfs_print_data_csum_error(inode, disk_start,
+ csum, cb_sum, cb->mirror_num);
+ if (btrfs_io_bio(bio)->device)
+ btrfs_dev_stat_inc_and_print(
+ btrfs_io_bio(bio)->device,
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ return -EIO;
+ }
+ cb_sum += csum_size;
+ disk_start += sectorsize;
}
- cb_sum += csum_size;
}
return 0;
}
@@ -640,7 +653,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree,
page_offset(bio_first_page_all(bio)),
- PAGE_SIZE);
+ fs_info->sectorsize);
read_unlock(&em_tree->lock);
if (!em)
return BLK_STS_IOERR;
@@ -698,19 +711,30 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
refcount_set(&cb->pending_bios, 1);
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+ u32 pg_len = PAGE_SIZE;
int submit = 0;
+ /*
+ * To handle subpage case, we need to make sure the bio only
+ * covers the range we need.
+ *
+ * If we're at the last page, truncate the length to only cover
+ * the remaining part.
+ */
+ if (pg_index == nr_pages - 1)
+ pg_len = min_t(u32, PAGE_SIZE,
+ compressed_len - pg_index * PAGE_SIZE);
+
page = cb->compressed_pages[pg_index];
page->mapping = inode->i_mapping;
page->index = em_start >> PAGE_SHIFT;
if (comp_bio->bi_iter.bi_size)
- submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE,
+ submit = btrfs_bio_fits_in_stripe(page, pg_len,
comp_bio, 0);
page->mapping = NULL;
- if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
- PAGE_SIZE) {
+ if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) {
unsigned int nr_sectors;
ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
@@ -743,9 +767,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
- bio_add_page(comp_bio, page, PAGE_SIZE, 0);
+ bio_add_page(comp_bio, page, pg_len, 0);
}
- cur_disk_byte += PAGE_SIZE;
+ cur_disk_byte += pg_len;
}
ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
@@ -1237,7 +1261,6 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
unsigned long prev_start_byte;
unsigned long working_bytes = total_out - buf_start;
unsigned long bytes;
- char *kaddr;
struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter);
/*
@@ -1268,9 +1291,8 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
PAGE_SIZE - (buf_offset % PAGE_SIZE));
bytes = min(bytes, working_bytes);
- kaddr = kmap_atomic(bvec.bv_page);
- memcpy(kaddr + bvec.bv_offset, buf + buf_offset, bytes);
- kunmap_atomic(kaddr);
+ memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + buf_offset,
+ bytes);
flush_dcache_page(bvec.bv_page);
buf_offset += bytes;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index bd659354d043..9ae776ab3967 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -524,6 +524,11 @@ struct btrfs_swapfile_pin {
* points to a struct btrfs_device.
*/
bool is_block_group;
+ /*
+ * Only used when 'is_block_group' is true and it is the number of
+ * extents used by a swapfile for this block group ('ptr' field).
+ */
+ int bg_extent_count;
};
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index ec0b50b8c5d6..bf25401c9768 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -627,7 +627,8 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
- ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC, true);
if (ret < 0)
return ret;
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
@@ -649,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
btrfs_ino(inode),
num_bytes, 1);
} else {
- btrfs_qgroup_free_meta_prealloc(root, fs_info->nodesize);
+ btrfs_qgroup_free_meta_prealloc(root, num_bytes);
}
return ret;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dfb3ead1175..4671c99d468d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3008,12 +3008,23 @@ readpage_ok:
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_SHIFT;
- unsigned off;
- /* Zero out the end if this page straddles i_size */
- off = offset_in_page(i_size);
- if (page->index == end_index && off)
- zero_user_segment(page, off, PAGE_SIZE);
+ /*
+ * Zero out the remaining part if this range straddles
+ * i_size.
+ *
+ * Here we should only zero the range inside the bvec,
+ * not touch anything else.
+ *
+ * NOTE: i_size is exclusive while end is inclusive.
+ */
+ if (page->index == end_index && i_size <= end) {
+ u32 zero_start = max(offset_in_page(i_size),
+ offset_in_page(end));
+
+ zero_user_segment(page, zero_start,
+ offset_in_page(end) + 1);
+ }
}
ASSERT(bio_offset + len > bio_offset);
bio_offset += len;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index be9e3900cce8..0e155f013839 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3260,8 +3260,11 @@ reserve_space:
goto out;
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
- if (ret)
+ if (ret) {
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
goto out;
+ }
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
i_blocksize(inode),
@@ -3634,7 +3637,7 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return ret;
}
- return generic_file_buffered_read(iocb, to, ret);
+ return filemap_read(iocb, to, ret);
}
const struct file_operations btrfs_file_operations = {
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5400294bd271..9988decd5717 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2555,7 +2555,12 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
to_unusable = size - to_free;
ctl->free_space += to_free;
- block_group->zone_unusable += to_unusable;
+ /*
+ * If the block group is read-only, we should account freed space into
+ * bytes_readonly.
+ */
+ if (!block_group->ro)
+ block_group->zone_unusable += to_unusable;
spin_unlock(&ctl->tree_lock);
if (!used) {
spin_lock(&block_group->lock);
@@ -2801,8 +2806,10 @@ static void __btrfs_return_cluster_to_free_space(
struct rb_node *node;
spin_lock(&cluster->lock);
- if (cluster->block_group != block_group)
- goto out;
+ if (cluster->block_group != block_group) {
+ spin_unlock(&cluster->lock);
+ return;
+ }
cluster->block_group = NULL;
cluster->window_start = 0;
@@ -2840,8 +2847,6 @@ static void __btrfs_return_cluster_to_free_space(
entry->offset, &entry->offset_index, bitmap);
}
cluster->root = RB_ROOT;
-
-out:
spin_unlock(&cluster->lock);
btrfs_put_block_group(block_group);
}
@@ -3125,8 +3130,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
entry->bytes -= bytes;
}
- if (entry->bytes == 0)
- rb_erase(&entry->offset_index, &cluster->root);
break;
}
out:
@@ -3143,7 +3146,10 @@ out:
ctl->free_space -= bytes;
if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
+
+ spin_lock(&cluster->lock);
if (entry->bytes == 0) {
+ rb_erase(&entry->offset_index, &cluster->root);
ctl->free_extents--;
if (entry->bitmap) {
kmem_cache_free(btrfs_free_space_bitmap_cachep,
@@ -3156,6 +3162,7 @@ out:
kmem_cache_free(btrfs_free_space_cachep, entry);
}
+ spin_unlock(&cluster->lock);
spin_unlock(&ctl->tree_lock);
return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2e1c282c202d..35bfa0533f23 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1674,9 +1674,6 @@ next_slot:
*/
btrfs_release_path(path);
- /* If extent is RO, we must COW it */
- if (btrfs_extent_readonly(fs_info, disk_bytenr))
- goto out_check;
ret = btrfs_cross_ref_exist(root, ino,
found_key.offset -
extent_offset, disk_bytenr, false);
@@ -1723,6 +1720,7 @@ next_slot:
WARN_ON_ONCE(freespace_inode);
goto out_check;
}
+ /* If the extent's block group is RO, we must COW */
if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
goto out_check;
nocow = true;
@@ -6085,7 +6083,7 @@ static int btrfs_dirty_inode(struct inode *inode)
return PTR_ERR(trans);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
- if (ret && ret == -ENOSPC) {
+ if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(root, 1);
@@ -10200,6 +10198,7 @@ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
sp->ptr = ptr;
sp->inode = inode;
sp->is_block_group = is_block_group;
+ sp->bg_extent_count = 1;
spin_lock(&fs_info->swapfile_pins_lock);
p = &fs_info->swapfile_pins.rb_node;
@@ -10213,6 +10212,8 @@ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
(sp->ptr == entry->ptr && sp->inode > entry->inode)) {
p = &(*p)->rb_right;
} else {
+ if (is_block_group)
+ entry->bg_extent_count++;
spin_unlock(&fs_info->swapfile_pins_lock);
kfree(sp);
return 1;
@@ -10238,8 +10239,11 @@ static void btrfs_free_swapfile_pins(struct inode *inode)
sp = rb_entry(node, struct btrfs_swapfile_pin, node);
if (sp->inode == inode) {
rb_erase(&sp->node, &fs_info->swapfile_pins);
- if (sp->is_block_group)
+ if (sp->is_block_group) {
+ btrfs_dec_block_group_swap_extents(sp->ptr,
+ sp->bg_extent_count);
btrfs_put_block_group(sp->ptr);
+ }
kfree(sp);
}
node = next;
@@ -10300,7 +10304,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
sector_t *span)
{
struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
struct extent_map *em = NULL;
@@ -10351,13 +10356,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
"cannot activate swapfile while exclusive operation is running");
return -EBUSY;
}
+
+ /*
+ * Prevent snapshot creation while we are activating the swap file.
+ * We do not want to race with snapshot creation. If snapshot creation
+ * already started before we bumped nr_swapfiles from 0 to 1 and
+ * completes before the first write into the swap file after it is
+ * activated, than that write would fallback to COW.
+ */
+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
+ btrfs_exclop_finish(fs_info);
+ btrfs_warn(fs_info,
+ "cannot activate swapfile because snapshot creation is in progress");
+ return -EINVAL;
+ }
/*
* Snapshots can create extents which require COW even if NODATACOW is
* set. We use this counter to prevent snapshots. We must increment it
* before walking the extents because we don't want a concurrent
* snapshot to run after we've already checked the extents.
*/
- atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles);
+ atomic_inc(&root->nr_swapfiles);
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
@@ -10454,6 +10473,17 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
goto out;
}
+ if (!btrfs_inc_block_group_swap_extents(bg)) {
+ btrfs_warn(fs_info,
+ "block group for swapfile at %llu is read-only%s",
+ bg->start,
+ atomic_read(&fs_info->scrubs_running) ?
+ " (scrub running)" : "");
+ btrfs_put_block_group(bg);
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = btrfs_add_swapfile_pin(inode, bg, true);
if (ret) {
btrfs_put_block_group(bg);
@@ -10492,6 +10522,8 @@ out:
if (ret)
btrfs_swap_deactivate(file);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+
btrfs_exclop_finish(fs_info);
if (ret)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 072e77726e94..e8d53fea4c61 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1936,7 +1936,10 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
- if (vol_args->size > PAGE_SIZE) {
+ u64 nums;
+
+ if (vol_args->size < sizeof(*inherit) ||
+ vol_args->size > PAGE_SIZE) {
ret = -EINVAL;
goto free_args;
}
@@ -1945,6 +1948,20 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
ret = PTR_ERR(inherit);
goto free_args;
}
+
+ if (inherit->num_qgroups > PAGE_SIZE ||
+ inherit->num_ref_copies > PAGE_SIZE ||
+ inherit->num_excl_copies > PAGE_SIZE) {
+ ret = -EINVAL;
+ goto free_inherit;
+ }
+
+ nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
+ 2 * inherit->num_excl_copies;
+ if (vol_args->size != struct_size(inherit, qgroups, nums)) {
+ ret = -EINVAL;
+ goto free_inherit;
+ }
}
ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index aa9cd11f4b78..9084a950dc09 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -467,7 +467,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
destlen = min_t(unsigned long, destlen, PAGE_SIZE);
bytes = min_t(unsigned long, destlen, out_len - start_byte);
- kaddr = kmap_atomic(dest_page);
+ kaddr = kmap_local_page(dest_page);
memcpy(kaddr, workspace->buf + start_byte, bytes);
/*
@@ -477,7 +477,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
*/
if (bytes < destlen)
memset(kaddr+bytes, 0, destlen-bytes);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 808370ada888..14ff388fd3bd 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3841,8 +3841,8 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
return num_bytes;
}
-static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- enum btrfs_qgroup_rsv_type type, bool enforce)
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -3873,14 +3873,14 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
{
int ret;
- ret = qgroup_reserve_meta(root, num_bytes, type, enforce);
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
if (ret <= 0 && ret != -EDQUOT)
return ret;
ret = try_flush_qgroup(root);
if (ret < 0)
return ret;
- return qgroup_reserve_meta(root, num_bytes, type, enforce);
+ return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
}
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 50dea9a2d8fb..7283e4f549af 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -361,6 +361,8 @@ int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len);
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start,
u64 len);
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
/* Reserve metadata space for pertrans and prealloc type */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 8ec34ecb6d68..8c31357f08ed 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -249,8 +249,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
int i;
- char *s;
- char *d;
int ret;
ret = alloc_rbio_pages(rbio);
@@ -261,13 +259,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
if (!rbio->bio_pages[i])
continue;
- s = kmap(rbio->bio_pages[i]);
- d = kmap(rbio->stripe_pages[i]);
-
- copy_page(d, s);
-
- kunmap(rbio->bio_pages[i]);
- kunmap(rbio->stripe_pages[i]);
+ copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
SetPageUptodate(rbio->stripe_pages[i]);
}
set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -2359,16 +2351,21 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
SetPageUptodate(p_page);
if (has_qstripe) {
+ /* RAID6, allocate and map temp space for the Q stripe */
q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (!q_page) {
__free_page(p_page);
goto cleanup;
}
SetPageUptodate(q_page);
+ pointers[rbio->real_stripes - 1] = kmap(q_page);
}
atomic_set(&rbio->error, 0);
+ /* Map the parity stripe just once */
+ pointers[nr_data] = kmap(p_page);
+
for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
struct page *p;
void *parity;
@@ -2378,16 +2375,8 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
pointers[stripe] = kmap(p);
}
- /* then add the parity stripe */
- pointers[stripe++] = kmap(p_page);
-
if (has_qstripe) {
- /*
- * raid6, add the qstripe and call the
- * library function to fill in our p/q
- */
- pointers[stripe++] = kmap(q_page);
-
+ /* RAID6, call the library function to fill in our P/Q */
raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
pointers);
} else {
@@ -2408,12 +2397,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
for (stripe = 0; stripe < nr_data; stripe++)
kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
- kunmap(p_page);
}
+ kunmap(p_page);
__free_page(p_page);
- if (q_page)
+ if (q_page) {
+ kunmap(q_page);
__free_page(q_page);
+ }
writeback:
/*
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 2b490becbe67..8e026de74c44 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -218,11 +218,11 @@ static void __print_stack_trace(struct btrfs_fs_info *fs_info,
stack_trace_print(ra->trace, ra->trace_len, 2);
}
#else
-static void inline __save_stack_trace(struct ref_action *ra)
+static inline void __save_stack_trace(struct ref_action *ra)
{
}
-static void inline __print_stack_trace(struct btrfs_fs_info *fs_info,
+static inline void __print_stack_trace(struct btrfs_fs_info *fs_info,
struct ref_action *ra)
{
btrfs_err(fs_info, " ref-verify: no stacktrace support");
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index b24396cf2f99..762881b777b3 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -106,12 +106,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
if (comp_type == BTRFS_COMPRESS_NONE) {
- char *map;
-
- map = kmap(page);
- memcpy(map, data_start, datal);
+ memcpy_to_page(page, 0, data_start, datal);
flush_dcache_page(page);
- kunmap(page);
} else {
ret = btrfs_decompress(comp_type, data_start, page, 0,
inline_size, datal);
@@ -553,6 +549,24 @@ process_slot:
*/
btrfs_release_path(path);
+ /*
+ * When using NO_HOLES and we are cloning a range that covers
+ * only a hole (no extents) into a range beyond the current
+ * i_size, punching a hole in the target range will not create
+ * an extent map defining a hole, because the range starts at or
+ * beyond current i_size. If the file previously had an i_size
+ * greater than the new i_size set by this clone operation, we
+ * need to make sure the next fsync is a full fsync, so that it
+ * detects and logs a hole covering a range from the current
+ * i_size to the new i_size. If the clone range covers extents,
+ * besides a hole, then we know the full sync flag was already
+ * set by previous calls to btrfs_replace_file_extents() that
+ * replaced file extent items.
+ */
+ if (last_dest_end >= i_size_read(inode))
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
ret = btrfs_replace_file_extents(inode, path, last_dest_end,
destoff + len - 1, NULL, &trans);
if (ret)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 582df11d298a..c2900ebf767a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3767,6 +3767,13 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* commit_transactions.
*/
ro_set = 0;
+ } else if (ret == -ETXTBSY) {
+ btrfs_warn(fs_info,
+ "skipping scrub of block group %llu due to active swapfile",
+ cache->start);
+ scrub_pause_off(fs_info);
+ ret = 0;
+ goto skip_unfreeze;
} else {
btrfs_warn(fs_info,
"failed setting block group ro: %d", ret);
@@ -3862,7 +3869,7 @@ done:
} else {
spin_unlock(&cache->lock);
}
-
+skip_unfreeze:
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
if (ret)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f87878274e9f..8f323859156b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4932,7 +4932,6 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode;
struct page *page;
- char *addr;
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
unsigned pg_offset = offset_in_page(offset);
@@ -4985,10 +4984,8 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
}
}
- addr = kmap(page);
- memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset,
- cur_len);
- kunmap(page);
+ memcpy_from_page(sctx->send_buf + sctx->send_size, page,
+ pg_offset, cur_len);
unlock_page(page);
put_page(page);
index++;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f8435641b912..f7a4ad86adee 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1918,8 +1918,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
btrfs_resize_thread_pool(fs_info,
fs_info->thread_pool_size, old_thread_pool_size);
- if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
- btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
+ (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
(!sb_rdonly(sb) || (*flags & SB_RDONLY))) {
btrfs_warn(fs_info,
"remount supports changing free space tree only from ro to rw");
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 582061c7b547..f4ade821307d 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1453,22 +1453,14 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
return -EUCLEAN;
}
for (; ptr < end; ptr += sizeof(*dref)) {
- u64 root_objectid;
- u64 owner;
u64 offset;
- u64 hash;
+ /*
+ * We cannot check the extent_data_ref hash due to possible
+ * overflow from the leaf due to hash collisions.
+ */
dref = (struct btrfs_extent_data_ref *)ptr;
- root_objectid = btrfs_extent_data_ref_root(leaf, dref);
- owner = btrfs_extent_data_ref_objectid(leaf, dref);
offset = btrfs_extent_data_ref_offset(leaf, dref);
- hash = hash_extent_data_ref(root_objectid, owner, offset);
- if (unlikely(hash != key->offset)) {
- extent_err(leaf, slot,
- "invalid extent data ref hash, item has 0x%016llx key has 0x%016llx",
- hash, key->offset);
- return -EUCLEAN;
- }
if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid extent data backref offset, have %llu expect aligned to %u",
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d90695c1ab6c..2f1acc9aea9e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3174,16 +3174,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
root_log_ctx.log_transid = log_root_tree->log_transid;
if (btrfs_is_zoned(fs_info)) {
- mutex_lock(&fs_info->tree_root->log_mutex);
if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
}
}
- mutex_unlock(&fs_info->tree_root->log_mutex);
}
/*
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index b025102e435f..8a4514283a4b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -229,11 +229,33 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
+ const bool start_trans = (current->journal_info == NULL);
int ret;
- trans = btrfs_start_transaction(root, 2);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (start_trans) {
+ /*
+ * 1 unit for inserting/updating/deleting the xattr
+ * 1 unit for the inode item update
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ } else {
+ /*
+ * This can happen when smack is enabled and a directory is being
+ * created. It happens through d_instantiate_new(), which calls
+ * smack_d_instantiate(), which in turn calls __vfs_setxattr() to
+ * set the transmute xattr (XATTR_NAME_SMACKTRANSMUTE) on the
+ * inode. We have already reserved space for the xattr and inode
+ * update at btrfs_mkdir(), so just use the transaction handle.
+ * We don't join or start a transaction, as that will reset the
+ * block_rsv of the handle and trigger a warning for the start
+ * case.
+ */
+ ASSERT(strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN) == 0);
+ trans = current->journal_info;
+ }
ret = btrfs_setxattr(trans, inode, name, value, size, flags);
if (ret)
@@ -244,7 +266,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
BUG_ON(ret);
out:
- btrfs_end_transaction(trans);
+ if (start_trans)
+ btrfs_end_transaction(trans);
return ret;
}
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 05615a1099db..d524acf7b3e5 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -432,9 +432,8 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in,
PAGE_SIZE - (buf_offset % PAGE_SIZE));
bytes = min(bytes, bytes_left);
- kaddr = kmap_atomic(dest_page);
- memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
- kunmap_atomic(kaddr);
+ memcpy_to_page(dest_page, pg_offset,
+ workspace->buf + buf_offset, bytes);
pg_offset += bytes;
bytes_left -= bytes;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index d0eb0c8d6269..1f972b75a9ab 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -269,7 +269,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
sector_t sector = 0;
struct blk_zone *zones = NULL;
unsigned int i, nreported = 0, nr_zones;
- unsigned int zone_sectors;
+ sector_t zone_sectors;
char *model, *emulated;
int ret;
@@ -658,7 +658,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
u64 *bytenr_ret)
{
struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
- unsigned int zone_sectors;
+ sector_t zone_sectors;
u32 sb_zone;
int ret;
u8 zone_sectors_shift;
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 9a4871636c6c..8e9626d63976 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -688,10 +688,8 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
bytes = min_t(unsigned long, destlen - pg_offset,
workspace->out_buf.size - buf_offset);
- kaddr = kmap_atomic(dest_page);
- memcpy(kaddr + pg_offset, workspace->out_buf.dst + buf_offset,
- bytes);
- kunmap_atomic(kaddr);
+ memcpy_to_page(dest_page, pg_offset,
+ workspace->out_buf.dst + buf_offset, bytes);
pg_offset += bytes;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 32647d2011df..0cb7ffd4977c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -847,7 +847,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
if (retry)
gfp |= __GFP_NOFAIL;
- memcg = get_mem_cgroup_from_page(page);
+ /* The page lock pins the memcg */
+ memcg = page_memcg(page);
old_memcg = set_active_memcg(memcg);
head = NULL;
@@ -868,7 +869,6 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
}
out:
set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
return head;
/*
* In case anything failed, we just free everything we got.
@@ -2083,7 +2083,8 @@ static int __block_commit_write(struct inode *inode, struct page *page,
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
}
- clear_buffer_new(bh);
+ if (buffer_new(bh))
+ clear_buffer_new(bh);
block_start = block_end;
bh = bh->b_this_page;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index aa697ccfa9dc..3aedc484e440 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -133,11 +133,12 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan)
{
struct TCP_Server_Info *server = chan->server;
- seq_printf(m, "\t\tChannel %d Number of credits: %d Dialect 0x%x "
- "TCP status: %d Instance: %d Local Users To Server: %d "
- "SecMode: 0x%x Req On Wire: %d In Send: %d "
- "In MaxReq Wait: %d\n",
- i+1,
+ seq_printf(m, "\n\n\t\tChannel: %d ConnectionId: 0x%llx"
+ "\n\t\tNumber of credits: %d Dialect 0x%x"
+ "\n\t\tTCP status: %d Instance: %d"
+ "\n\t\tLocal Users To Server: %d SecMode: 0x%x Req On Wire: %d"
+ "\n\t\tIn Send: %d In MaxReq Wait: %d",
+ i+1, server->conn_id,
server->credits,
server->dialect,
server->tcpStatus,
@@ -227,7 +228,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
- int i, j;
+ int c, i, j;
seq_puts(m,
"Display Internal CIFS Data Structures for Debugging\n"
@@ -275,14 +276,25 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_putc(m, '\n');
seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize);
seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
- seq_printf(m, "Servers:");
- i = 0;
+ seq_printf(m, "\nServers: ");
+
+ c = 0;
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp1, &cifs_tcp_ses_list) {
server = list_entry(tmp1, struct TCP_Server_Info,
tcp_ses_list);
+ /* channel info will be printed as a part of sessions below */
+ if (server->is_channel)
+ continue;
+
+ c++;
+ seq_printf(m, "\n%d) ConnectionId: 0x%llx ",
+ c, server->conn_id);
+
+ if (server->hostname)
+ seq_printf(m, "Hostname: %s ", server->hostname);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (!server->rdma)
goto skip_rdma;
@@ -362,46 +374,48 @@ skip_rdma:
if (server->posix_ext_supported)
seq_printf(m, " posix");
- i++;
+ if (server->rdma)
+ seq_printf(m, "\nRDMA ");
+ seq_printf(m, "\nTCP status: %d Instance: %d"
+ "\nLocal Users To Server: %d SecMode: 0x%x Req On Wire: %d",
+ server->tcpStatus,
+ server->reconnect_instance,
+ server->srv_count,
+ server->sec_mode, in_flight(server));
+
+ seq_printf(m, "\nIn Send: %d In MaxReq Wait: %d",
+ atomic_read(&server->in_send),
+ atomic_read(&server->num_waiters));
+
+ seq_printf(m, "\n\n\tSessions: ");
+ i = 0;
list_for_each(tmp2, &server->smb_ses_list) {
ses = list_entry(tmp2, struct cifs_ses,
smb_ses_list);
+ i++;
if ((ses->serverDomain == NULL) ||
(ses->serverOS == NULL) ||
(ses->serverNOS == NULL)) {
- seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d ",
- i, ses->serverName, ses->ses_count,
+ seq_printf(m, "\n\t%d) Address: %s Uses: %d Capability: 0x%x\tSession Status: %d ",
+ i, ses->ip_addr, ses->ses_count,
ses->capabilities, ses->status);
if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST)
- seq_printf(m, "Guest\t");
+ seq_printf(m, "Guest ");
else if (ses->session_flags & SMB2_SESSION_FLAG_IS_NULL)
- seq_printf(m, "Anonymous\t");
+ seq_printf(m, "Anonymous ");
} else {
seq_printf(m,
- "\n%d) Name: %s Domain: %s Uses: %d OS:"
- " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB"
- " session status: %d ",
- i, ses->serverName, ses->serverDomain,
+ "\n\t%d) Name: %s Domain: %s Uses: %d OS: %s "
+ "\n\tNOS: %s\tCapability: 0x%x"
+ "\n\tSMB session status: %d ",
+ i, ses->ip_addr, ses->serverDomain,
ses->ses_count, ses->serverOS, ses->serverNOS,
ses->capabilities, ses->status);
}
- seq_printf(m,"Security type: %s\n",
+ seq_printf(m, "\n\tSecurity type: %s ",
get_security_type_str(server->ops->select_sectype(server, ses->sectype)));
- if (server->rdma)
- seq_printf(m, "RDMA\n\t");
- seq_printf(m, "TCP status: %d Instance: %d\n\tLocal Users To "
- "Server: %d SecMode: 0x%x Req On Wire: %d",
- server->tcpStatus,
- server->reconnect_instance,
- server->srv_count,
- server->sec_mode, in_flight(server));
-
- seq_printf(m, " In Send: %d In MaxReq Wait: %d",
- atomic_read(&server->in_send),
- atomic_read(&server->num_waiters));
-
/* dump session id helpful for use with network trace */
seq_printf(m, " SessionId: 0x%llx", ses->Suid);
if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
@@ -414,13 +428,13 @@ skip_rdma:
from_kuid(&init_user_ns, ses->cred_uid));
if (ses->chan_count > 1) {
- seq_printf(m, "\n\n\tExtra Channels: %zu\n",
+ seq_printf(m, "\n\n\tExtra Channels: %zu ",
ses->chan_count-1);
for (j = 1; j < ses->chan_count; j++)
cifs_dump_channel(m, j, &ses->chans[j]);
}
- seq_puts(m, "\n\n\tShares:");
+ seq_puts(m, "\n\n\tShares: ");
j = 0;
seq_printf(m, "\n\t%d) IPC: ", j);
@@ -437,38 +451,43 @@ skip_rdma:
cifs_debug_tcon(m, tcon);
}
- seq_puts(m, "\n\tMIDs:\n");
-
- spin_lock(&GlobalMid_Lock);
- list_for_each(tmp3, &server->pending_mid_q) {
- mid_entry = list_entry(tmp3, struct mid_q_entry,
- qhead);
- seq_printf(m, "\tState: %d com: %d pid:"
- " %d cbdata: %p mid %llu\n",
- mid_entry->mid_state,
- le16_to_cpu(mid_entry->command),
- mid_entry->pid,
- mid_entry->callback_data,
- mid_entry->mid);
- }
- spin_unlock(&GlobalMid_Lock);
-
spin_lock(&ses->iface_lock);
if (ses->iface_count)
- seq_printf(m, "\n\tServer interfaces: %zu\n",
+ seq_printf(m, "\n\n\tServer interfaces: %zu",
ses->iface_count);
for (j = 0; j < ses->iface_count; j++) {
struct cifs_server_iface *iface;
iface = &ses->iface_list[j];
- seq_printf(m, "\t%d)", j);
+ seq_printf(m, "\n\t%d)", j+1);
cifs_dump_iface(m, iface);
if (is_ses_using_iface(ses, iface))
seq_puts(m, "\t\t[CONNECTED]\n");
}
spin_unlock(&ses->iface_lock);
}
+ if (i == 0)
+ seq_printf(m, "\n\t\t[NONE]");
+
+ seq_puts(m, "\n\n\tMIDs: ");
+ spin_lock(&GlobalMid_Lock);
+ list_for_each(tmp3, &server->pending_mid_q) {
+ mid_entry = list_entry(tmp3, struct mid_q_entry,
+ qhead);
+ seq_printf(m, "\n\tState: %d com: %d pid:"
+ " %d cbdata: %p mid %llu\n",
+ mid_entry->mid_state,
+ le16_to_cpu(mid_entry->command),
+ mid_entry->pid,
+ mid_entry->callback_data,
+ mid_entry->mid);
+ }
+ spin_unlock(&GlobalMid_Lock);
+ seq_printf(m, "\n--\n");
}
+ if (c == 0)
+ seq_printf(m, "\n\t[NONE]");
+
spin_unlock(&cifs_tcp_ses_lock);
seq_putc(m, '\n');
diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index d35f599aa00e..f2d730fffccb 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -272,7 +272,7 @@ static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon)
if (IS_ERR(share_name)) {
int ret;
- ret = PTR_ERR(net_name);
+ ret = PTR_ERR(share_name);
cifs_dbg(VFS, "%s: failed to extract share name from target '%s': %d\n",
__func__, tcon->treeName, ret);
kfree(net_name);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 562913e2b3f2..9d29eb9660c2 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -267,10 +267,11 @@ is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group)
return true; /* well known sid found, uid returned */
}
-static void
+static __u16
cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
{
int i;
+ __u16 size = 1 + 1 + 6;
dst->revision = src->revision;
dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES);
@@ -278,6 +279,9 @@ cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
dst->authority[i] = src->authority[i];
for (i = 0; i < dst->num_subauth; ++i)
dst->sub_auth[i] = src->sub_auth[i];
+ size += (dst->num_subauth * 4);
+
+ return size;
}
static int
@@ -521,8 +525,11 @@ exit_cifs_idmap(void)
}
/* copy ntsd, owner sid, and group sid from a security descriptor to another */
-static void copy_sec_desc(const struct cifs_ntsd *pntsd,
- struct cifs_ntsd *pnntsd, __u32 sidsoffset)
+static __u32 copy_sec_desc(const struct cifs_ntsd *pntsd,
+ struct cifs_ntsd *pnntsd,
+ __u32 sidsoffset,
+ struct cifs_sid *pownersid,
+ struct cifs_sid *pgrpsid)
{
struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
@@ -536,19 +543,25 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd,
pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid));
/* copy owner sid */
- owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ if (pownersid)
+ owner_sid_ptr = pownersid;
+ else
+ owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset);
cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr);
/* copy group sid */
- group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ if (pgrpsid)
+ group_sid_ptr = pgrpsid;
+ else
+ group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
le32_to_cpu(pntsd->gsidoffset));
ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset +
sizeof(struct cifs_sid));
cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr);
- return;
+ return sidsoffset + (2 * sizeof(struct cifs_sid));
}
@@ -663,6 +676,25 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
return;
}
+static __u16 cifs_copy_ace(struct cifs_ace *dst, struct cifs_ace *src, struct cifs_sid *psid)
+{
+ __u16 size = 1 + 1 + 2 + 4;
+
+ dst->type = src->type;
+ dst->flags = src->flags;
+ dst->access_req = src->access_req;
+
+ /* Check if there's a replacement sid specified */
+ if (psid)
+ size += cifs_copy_sid(&dst->sid, psid);
+ else
+ size += cifs_copy_sid(&dst->sid, &src->sid);
+
+ dst->size = cpu_to_le16(size);
+
+ return size;
+}
+
static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
const struct cifs_sid *psid, __u64 nmode,
umode_t bits, __u8 access_type,
@@ -907,29 +939,30 @@ unsigned int setup_special_user_owner_ACE(struct cifs_ace *pntace)
return ace_size;
}
-static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid,
- struct cifs_sid *pgrpsid, __u64 *pnmode, bool modefromsid)
+static void populate_new_aces(char *nacl_base,
+ struct cifs_sid *pownersid,
+ struct cifs_sid *pgrpsid,
+ __u64 *pnmode, u32 *pnum_aces, u16 *pnsize,
+ bool modefromsid)
{
- u16 size = 0;
- u32 num_aces = 0;
- struct cifs_acl *pnndacl;
__u64 nmode;
+ u32 num_aces = 0;
+ u16 nsize = 0;
__u64 user_mode;
__u64 group_mode;
__u64 other_mode;
__u64 deny_user_mode = 0;
__u64 deny_group_mode = 0;
bool sticky_set = false;
-
- pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl));
+ struct cifs_ace *pnntace = NULL;
nmode = *pnmode;
+ num_aces = *pnum_aces;
+ nsize = *pnsize;
if (modefromsid) {
- struct cifs_ace *pntace =
- (struct cifs_ace *)((char *)pnndacl + size);
-
- size += setup_special_mode_ACE(pntace, nmode);
+ pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ nsize += setup_special_mode_ACE(pnntace, nmode);
num_aces++;
goto set_size;
}
@@ -966,40 +999,170 @@ static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid,
sticky_set = true;
if (deny_user_mode) {
- size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
- pownersid, deny_user_mode, 0700, ACCESS_DENIED, false);
+ pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ nsize += fill_ace_for_sid(pnntace, pownersid, deny_user_mode,
+ 0700, ACCESS_DENIED, false);
num_aces++;
}
+
/* Group DENY ACE does not conflict with owner ALLOW ACE. Keep in preferred order*/
if (deny_group_mode && !(deny_group_mode & (user_mode >> 3))) {
- size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
- pgrpsid, deny_group_mode, 0070, ACCESS_DENIED, false);
+ pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode,
+ 0070, ACCESS_DENIED, false);
num_aces++;
}
- size += fill_ace_for_sid((struct cifs_ace *) ((char *)pnndacl + size),
- pownersid, user_mode, 0700, ACCESS_ALLOWED, true);
+
+ pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ nsize += fill_ace_for_sid(pnntace, pownersid, user_mode,
+ 0700, ACCESS_ALLOWED, true);
num_aces++;
+
/* Group DENY ACE conflicts with owner ALLOW ACE. So keep it after. */
if (deny_group_mode && (deny_group_mode & (user_mode >> 3))) {
- size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
- pgrpsid, deny_group_mode, 0070, ACCESS_DENIED, false);
+ pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode,
+ 0070, ACCESS_DENIED, false);
num_aces++;
}
- size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
- pgrpsid, group_mode, 0070, ACCESS_ALLOWED, !sticky_set);
+
+ pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ nsize += fill_ace_for_sid(pnntace, pgrpsid, group_mode,
+ 0070, ACCESS_ALLOWED, !sticky_set);
num_aces++;
- size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size),
- &sid_everyone, other_mode, 0007, ACCESS_ALLOWED, !sticky_set);
+
+ pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ nsize += fill_ace_for_sid(pnntace, &sid_everyone, other_mode,
+ 0007, ACCESS_ALLOWED, !sticky_set);
num_aces++;
set_size:
+ *pnum_aces = num_aces;
+ *pnsize = nsize;
+}
+
+static __u16 replace_sids_and_copy_aces(struct cifs_acl *pdacl, struct cifs_acl *pndacl,
+ struct cifs_sid *pownersid, struct cifs_sid *pgrpsid,
+ struct cifs_sid *pnownersid, struct cifs_sid *pngrpsid)
+{
+ int i;
+ u16 size = 0;
+ struct cifs_ace *pntace = NULL;
+ char *acl_base = NULL;
+ u32 src_num_aces = 0;
+ u16 nsize = 0;
+ struct cifs_ace *pnntace = NULL;
+ char *nacl_base = NULL;
+ u16 ace_size = 0;
+
+ acl_base = (char *)pdacl;
+ size = sizeof(struct cifs_acl);
+ src_num_aces = le32_to_cpu(pdacl->num_aces);
+
+ nacl_base = (char *)pndacl;
+ nsize = sizeof(struct cifs_acl);
+
+ /* Go through all the ACEs */
+ for (i = 0; i < src_num_aces; ++i) {
+ pntace = (struct cifs_ace *) (acl_base + size);
+ pnntace = (struct cifs_ace *) (nacl_base + nsize);
+
+ if (pnownersid && compare_sids(&pntace->sid, pownersid) == 0)
+ ace_size = cifs_copy_ace(pnntace, pntace, pnownersid);
+ else if (pngrpsid && compare_sids(&pntace->sid, pgrpsid) == 0)
+ ace_size = cifs_copy_ace(pnntace, pntace, pngrpsid);
+ else
+ ace_size = cifs_copy_ace(pnntace, pntace, NULL);
+
+ size += le16_to_cpu(pntace->size);
+ nsize += ace_size;
+ }
+
+ return nsize;
+}
+
+static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl,
+ struct cifs_sid *pownersid, struct cifs_sid *pgrpsid,
+ __u64 *pnmode, bool mode_from_sid)
+{
+ int i;
+ u16 size = 0;
+ struct cifs_ace *pntace = NULL;
+ char *acl_base = NULL;
+ u32 src_num_aces = 0;
+ u16 nsize = 0;
+ struct cifs_ace *pnntace = NULL;
+ char *nacl_base = NULL;
+ u32 num_aces = 0;
+ __u64 nmode;
+ bool new_aces_set = false;
+
+ /* Assuming that pndacl and pnmode are never NULL */
+ nmode = *pnmode;
+ nacl_base = (char *)pndacl;
+ nsize = sizeof(struct cifs_acl);
+
+ /* If pdacl is NULL, we don't have a src. Simply populate new ACL. */
+ if (!pdacl) {
+ populate_new_aces(nacl_base,
+ pownersid, pgrpsid,
+ pnmode, &num_aces, &nsize,
+ mode_from_sid);
+ goto finalize_dacl;
+ }
+
+ acl_base = (char *)pdacl;
+ size = sizeof(struct cifs_acl);
+ src_num_aces = le32_to_cpu(pdacl->num_aces);
+
+ /* Retain old ACEs which we can retain */
+ for (i = 0; i < src_num_aces; ++i) {
+ pntace = (struct cifs_ace *) (acl_base + size);
+ pnntace = (struct cifs_ace *) (nacl_base + nsize);
+
+ if (!new_aces_set && (pntace->flags & INHERITED_ACE)) {
+ /* Place the new ACEs in between existing explicit and inherited */
+ populate_new_aces(nacl_base,
+ pownersid, pgrpsid,
+ pnmode, &num_aces, &nsize,
+ mode_from_sid);
+
+ new_aces_set = true;
+ }
+
+ /* If it's any one of the ACE we're replacing, skip! */
+ if ((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) ||
+ (compare_sids(&pntace->sid, pownersid) == 0) ||
+ (compare_sids(&pntace->sid, pgrpsid) == 0) ||
+ (compare_sids(&pntace->sid, &sid_everyone) == 0) ||
+ (compare_sids(&pntace->sid, &sid_authusers) == 0)) {
+ goto next_ace;
+ }
+
+ nsize += cifs_copy_ace(pnntace, pntace, NULL);
+ num_aces++;
+
+next_ace:
+ size += le16_to_cpu(pntace->size);
+ }
+
+ /* If inherited ACEs are not present, place the new ones at the tail */
+ if (!new_aces_set) {
+ populate_new_aces(nacl_base,
+ pownersid, pgrpsid,
+ pnmode, &num_aces, &nsize,
+ mode_from_sid);
+
+ new_aces_set = true;
+ }
+
+finalize_dacl:
pndacl->num_aces = cpu_to_le32(num_aces);
- pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl));
+ pndacl->size = cpu_to_le16(nsize);
return 0;
}
-
static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
{
/* BB need to add parm so we can store the SID BB */
@@ -1094,7 +1257,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
/* Convert permission bits from mode to equivalent CIFS ACL */
static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
- __u32 secdesclen, __u64 *pnmode, kuid_t uid, kgid_t gid,
+ __u32 secdesclen, __u32 *pnsecdesclen, __u64 *pnmode, kuid_t uid, kgid_t gid,
bool mode_from_sid, bool id_from_sid, int *aclflag)
{
int rc = 0;
@@ -1102,39 +1265,59 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
__u32 ndacloffset;
__u32 sidsoffset;
struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
- struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
+ struct cifs_sid *nowner_sid_ptr = NULL, *ngroup_sid_ptr = NULL;
struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */
struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */
+ char *end_of_acl = ((char *)pntsd) + secdesclen;
+ u16 size = 0;
- if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */
- owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
- le32_to_cpu(pntsd->osidoffset));
- group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
- le32_to_cpu(pntsd->gsidoffset));
- dacloffset = le32_to_cpu(pntsd->dacloffset);
+ dacloffset = le32_to_cpu(pntsd->dacloffset);
+ if (dacloffset) {
dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
+ if (end_of_acl < (char *)dacl_ptr + le16_to_cpu(dacl_ptr->size)) {
+ cifs_dbg(VFS, "Server returned illegal ACL size\n");
+ return -EINVAL;
+ }
+ }
+
+ owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ le32_to_cpu(pntsd->osidoffset));
+ group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ le32_to_cpu(pntsd->gsidoffset));
+
+ if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */
ndacloffset = sizeof(struct cifs_ntsd);
ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
- ndacl_ptr->revision = dacl_ptr->revision;
- ndacl_ptr->size = 0;
- ndacl_ptr->num_aces = 0;
+ ndacl_ptr->revision =
+ dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION);
- rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr,
+ ndacl_ptr->size = cpu_to_le16(0);
+ ndacl_ptr->num_aces = cpu_to_le32(0);
+
+ rc = set_chmod_dacl(dacl_ptr, ndacl_ptr, owner_sid_ptr, group_sid_ptr,
pnmode, mode_from_sid);
+
sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
- /* copy sec desc control portion & owner and group sids */
- copy_sec_desc(pntsd, pnntsd, sidsoffset);
- *aclflag = CIFS_ACL_DACL;
+ /* copy the non-dacl portion of secdesc */
+ *pnsecdesclen = copy_sec_desc(pntsd, pnntsd, sidsoffset,
+ NULL, NULL);
+
+ *aclflag |= CIFS_ACL_DACL;
} else {
- memcpy(pnntsd, pntsd, secdesclen);
+ ndacloffset = sizeof(struct cifs_ntsd);
+ ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
+ ndacl_ptr->revision =
+ dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION);
+ ndacl_ptr->num_aces = dacl_ptr->num_aces;
+
if (uid_valid(uid)) { /* chown */
uid_t id;
- owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
- le32_to_cpu(pnntsd->osidoffset));
nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),
GFP_KERNEL);
- if (!nowner_sid_ptr)
- return -ENOMEM;
+ if (!nowner_sid_ptr) {
+ rc = -ENOMEM;
+ goto chown_chgrp_exit;
+ }
id = from_kuid(&init_user_ns, uid);
if (id_from_sid) {
struct owner_sid *osid = (struct owner_sid *)nowner_sid_ptr;
@@ -1145,27 +1328,25 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
osid->SubAuthorities[0] = cpu_to_le32(88);
osid->SubAuthorities[1] = cpu_to_le32(1);
osid->SubAuthorities[2] = cpu_to_le32(id);
+
} else { /* lookup sid with upcall */
rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr);
if (rc) {
cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n",
__func__, rc, id);
- kfree(nowner_sid_ptr);
- return rc;
+ goto chown_chgrp_exit;
}
}
- cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr);
- kfree(nowner_sid_ptr);
- *aclflag = CIFS_ACL_OWNER;
+ *aclflag |= CIFS_ACL_OWNER;
}
if (gid_valid(gid)) { /* chgrp */
gid_t id;
- group_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
- le32_to_cpu(pnntsd->gsidoffset));
ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),
GFP_KERNEL);
- if (!ngroup_sid_ptr)
- return -ENOMEM;
+ if (!ngroup_sid_ptr) {
+ rc = -ENOMEM;
+ goto chown_chgrp_exit;
+ }
id = from_kgid(&init_user_ns, gid);
if (id_from_sid) {
struct owner_sid *gsid = (struct owner_sid *)ngroup_sid_ptr;
@@ -1176,19 +1357,35 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
gsid->SubAuthorities[0] = cpu_to_le32(88);
gsid->SubAuthorities[1] = cpu_to_le32(2);
gsid->SubAuthorities[2] = cpu_to_le32(id);
+
} else { /* lookup sid with upcall */
rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr);
if (rc) {
cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n",
__func__, rc, id);
- kfree(ngroup_sid_ptr);
- return rc;
+ goto chown_chgrp_exit;
}
}
- cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr);
- kfree(ngroup_sid_ptr);
- *aclflag = CIFS_ACL_GROUP;
+ *aclflag |= CIFS_ACL_GROUP;
+ }
+
+ if (dacloffset) {
+ /* Replace ACEs for old owner with new one */
+ size = replace_sids_and_copy_aces(dacl_ptr, ndacl_ptr,
+ owner_sid_ptr, group_sid_ptr,
+ nowner_sid_ptr, ngroup_sid_ptr);
+ ndacl_ptr->size = cpu_to_le16(size);
}
+
+ sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size);
+ /* copy the non-dacl portion of secdesc */
+ *pnsecdesclen = copy_sec_desc(pntsd, pnntsd, sidsoffset,
+ nowner_sid_ptr, ngroup_sid_ptr);
+
+chown_chgrp_exit:
+ /* errors could jump here. So make sure we return soon after this */
+ kfree(nowner_sid_ptr);
+ kfree(ngroup_sid_ptr);
}
return rc;
@@ -1384,6 +1581,9 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
int rc = 0;
int aclflag = CIFS_ACL_DACL; /* default flag to set */
__u32 secdesclen = 0;
+ __u32 nsecdesclen = 0;
+ __u32 dacloffset = 0;
+ struct cifs_acl *dacl_ptr = NULL;
struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -1414,31 +1614,52 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
return rc;
}
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)
+ mode_from_sid = true;
+ else
+ mode_from_sid = false;
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL)
+ id_from_sid = true;
+ else
+ id_from_sid = false;
+
+ /* Potentially, five new ACEs can be added to the ACL for U,G,O mapping */
+ nsecdesclen = secdesclen;
+ if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */
+ if (mode_from_sid)
+ nsecdesclen += sizeof(struct cifs_ace);
+ else /* cifsacl */
+ nsecdesclen += 5 * sizeof(struct cifs_ace);
+ } else { /* chown */
+ /* When ownership changes, changes new owner sid length could be different */
+ nsecdesclen = sizeof(struct cifs_ntsd) + (sizeof(struct cifs_sid) * 2);
+ dacloffset = le32_to_cpu(pntsd->dacloffset);
+ if (dacloffset) {
+ dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
+ if (mode_from_sid)
+ nsecdesclen +=
+ le32_to_cpu(dacl_ptr->num_aces) * sizeof(struct cifs_ace);
+ else /* cifsacl */
+ nsecdesclen += le16_to_cpu(dacl_ptr->size);
+ }
+ }
+
/*
* Add three ACEs for owner, group, everyone getting rid of other ACEs
* as chmod disables ACEs and set the security descriptor. Allocate
* memory for the smb header, set security descriptor request security
* descriptor parameters, and secuirty descriptor itself
*/
- secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN);
- pnntsd = kmalloc(secdesclen, GFP_KERNEL);
+ nsecdesclen = max_t(u32, nsecdesclen, DEFAULT_SEC_DESC_LEN);
+ pnntsd = kmalloc(nsecdesclen, GFP_KERNEL);
if (!pnntsd) {
kfree(pntsd);
cifs_put_tlink(tlink);
return -ENOMEM;
}
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID)
- mode_from_sid = true;
- else
- mode_from_sid = false;
-
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL)
- id_from_sid = true;
- else
- id_from_sid = false;
-
- rc = build_sec_desc(pntsd, pnntsd, secdesclen, pnmode, uid, gid,
+ rc = build_sec_desc(pntsd, pnntsd, secdesclen, &nsecdesclen, pnmode, uid, gid,
mode_from_sid, id_from_sid, &aclflag);
cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc);
@@ -1448,7 +1669,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
if (!rc) {
/* Set the security descriptor */
- rc = ops->set_acl(pnntsd, secdesclen, inode, path, aclflag);
+ rc = ops->set_acl(pnntsd, nsecdesclen, inode, path, aclflag);
cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc);
}
cifs_put_tlink(tlink);
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index ff7fd0862e28..d9e704979d99 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -31,8 +31,8 @@
#define EXEC_BIT 0x1
#define ACL_OWNER_MASK 0700
-#define ACL_GROUP_MASK 0770
-#define ACL_EVERYONE_MASK 0777
+#define ACL_GROUP_MASK 0070
+#define ACL_EVERYONE_MASK 0007
#define UBITSHIFT 6
#define GBITSHIFT 3
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 51d53e4bdf6b..b8f1ff9a83f3 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -568,15 +568,15 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
return rc;
}
} else {
- /* We use ses->serverName if no domain name available */
- len = strlen(ses->serverName);
+ /* We use ses->ip_addr if no domain name available */
+ len = strlen(ses->ip_addr);
server = kmalloc(2 + (len * 2), GFP_KERNEL);
if (server == NULL) {
rc = -ENOMEM;
return rc;
}
- len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len,
+ len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len,
nls_cp);
rc =
crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 38534e066cc7..d43e935d2df4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -638,8 +638,18 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",snapshot=%llu", tcon->snapshot_time);
if (tcon->handle_timeout)
seq_printf(s, ",handletimeout=%u", tcon->handle_timeout);
- /* convert actimeo and display it in seconds */
- seq_printf(s, ",actimeo=%lu", cifs_sb->ctx->actimeo / HZ);
+
+ /*
+ * Display file and directory attribute timeout in seconds.
+ * If file and directory attribute timeout the same then actimeo
+ * was likely specified on mount
+ */
+ if (cifs_sb->ctx->acdirmax == cifs_sb->ctx->acregmax)
+ seq_printf(s, ",actimeo=%lu", cifs_sb->ctx->acregmax / HZ);
+ else {
+ seq_printf(s, ",acdirmax=%lu", cifs_sb->ctx->acdirmax / HZ);
+ seq_printf(s, ",acregmax=%lu", cifs_sb->ctx->acregmax / HZ);
+ }
if (tcon->ses->chan_max > 1)
seq_printf(s, ",multichannel,max_channels=%zu",
@@ -1526,6 +1536,7 @@ init_cifs(void)
*/
atomic_set(&sesInfoAllocCount, 0);
atomic_set(&tconInfoAllocCount, 0);
+ atomic_set(&tcpSesNextId, 0);
atomic_set(&tcpSesAllocCount, 0);
atomic_set(&tcpSesReconnectCount, 0);
atomic_set(&tconInfoReconnectCount, 0);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 71e9c6abb2a6..0d7ef150dbb2 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -165,5 +165,5 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type,
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.30"
+#define CIFS_VERSION "2.31"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 50fcb65920e8..3de3c5908a72 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -21,6 +21,7 @@
#include <linux/in.h>
#include <linux/in6.h>
+#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
@@ -504,6 +505,8 @@ struct smb_version_operations {
loff_t (*llseek)(struct file *, struct cifs_tcon *, loff_t, int);
/* Check for STATUS_IO_TIMEOUT */
bool (*is_status_io_timeout)(char *buf);
+ /* Check for STATUS_NETWORK_NAME_DELETED */
+ void (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv);
};
struct smb_version_values {
@@ -577,6 +580,7 @@ inc_rfc1001_len(void *buf, int count)
struct TCP_Server_Info {
struct list_head tcp_ses_list;
struct list_head smb_ses_list;
+ __u64 conn_id; /* connection identifier (useful for debugging) */
int srv_count; /* reference counter */
/* 15 character server name + 0x20 16th byte indicating type = srv */
char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
@@ -901,7 +905,7 @@ struct cifs_ses {
kuid_t linux_uid; /* overriding owner of files on the mount */
kuid_t cred_uid; /* owner of credentials */
unsigned int capabilities;
- char serverName[SERVER_NAME_LEN_WITH_NULL];
+ char ip_addr[INET6_ADDRSTRLEN + 1]; /* Max ipv6 (or v4) addr string len */
char *user_name; /* must not be null except during init of sess
and after mount option parsing we fill it */
char *domainName;
@@ -1704,7 +1708,9 @@ static inline bool is_retryable_error(int error)
#define CIFS_ECHO_OP 0x080 /* echo request */
#define CIFS_OBREAK_OP 0x0100 /* oplock break request */
#define CIFS_NEG_OP 0x0200 /* negotiate request */
-#define CIFS_OP_MASK 0x0380 /* mask request type */
+/* Lower bitmask values are reserved by others below. */
+#define CIFS_SESS_OP 0x2000 /* session setup request */
+#define CIFS_OP_MASK 0x2380 /* mask request type */
#define CIFS_HAS_CREDITS 0x0400 /* already has credits */
#define CIFS_TRANSFORM_REQ 0x0800 /* transform request before sending */
@@ -1844,6 +1850,7 @@ GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */
*/
GLOBAL_EXTERN atomic_t sesInfoAllocCount;
GLOBAL_EXTERN atomic_t tconInfoAllocCount;
+GLOBAL_EXTERN atomic_t tcpSesNextId;
GLOBAL_EXTERN atomic_t tcpSesAllocCount;
GLOBAL_EXTERN atomic_t tcpSesReconnectCount;
GLOBAL_EXTERN atomic_t tconInfoReconnectCount;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 32f7a013402e..75ce6f742b8d 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -232,6 +232,8 @@ extern unsigned int setup_special_user_owner_ACE(struct cifs_ace *pace);
extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
unsigned int to_read);
+extern ssize_t cifs_discard_from_socket(struct TCP_Server_Info *server,
+ size_t to_read);
extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
struct page *page,
unsigned int page_offset,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 0496934feecb..c279527aae92 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1451,9 +1451,9 @@ cifs_discard_remaining_data(struct TCP_Server_Info *server)
while (remaining > 0) {
int length;
- length = cifs_read_from_socket(server, server->bigbuf,
- min_t(unsigned int, remaining,
- CIFSMaxBufSize + MAX_HEADER_SIZE(server)));
+ length = cifs_discard_from_socket(server,
+ min_t(size_t, remaining,
+ CIFSMaxBufSize + MAX_HEADER_SIZE(server)));
if (length < 0)
return length;
server->total_read += length;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4bb9decbbf27..112692300fb6 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -242,7 +242,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
server->max_read = 0;
cifs_dbg(FYI, "Mark tcp session as need reconnect\n");
- trace_smb3_reconnect(server->CurrentMid, server->hostname);
+ trace_smb3_reconnect(server->CurrentMid, server->conn_id, server->hostname);
/* before reconnecting the tcp session, mark the smb session (uid)
and the tid bad so they are not used until reconnected */
@@ -564,6 +564,23 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
return cifs_readv_from_socket(server, &smb_msg);
}
+ssize_t
+cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read)
+{
+ struct msghdr smb_msg;
+
+ /*
+ * iov_iter_discard already sets smb_msg.type and count and iov_offset
+ * and cifs_readv_from_socket sets msg_control and msg_controllen
+ * so little to initialize in struct msghdr
+ */
+ smb_msg.msg_name = NULL;
+ smb_msg.msg_namelen = 0;
+ iov_iter_discard(&smb_msg.msg_iter, READ, to_read);
+
+ return cifs_readv_from_socket(server, &smb_msg);
+}
+
int
cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
unsigned int page_offset, unsigned int to_read)
@@ -846,7 +863,7 @@ static void
smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
{
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
- int scredits = server->credits;
+ int scredits, in_flight;
/*
* SMB1 does not use credits.
@@ -857,12 +874,14 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
if (shdr->CreditRequest) {
spin_lock(&server->req_lock);
server->credits += le16_to_cpu(shdr->CreditRequest);
+ scredits = server->credits;
+ in_flight = server->in_flight;
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
trace_smb3_add_credits(server->CurrentMid,
- server->hostname, scredits,
- le16_to_cpu(shdr->CreditRequest));
+ server->conn_id, server->hostname, scredits,
+ le16_to_cpu(shdr->CreditRequest), in_flight);
cifs_server_dbg(FYI, "%s: added %u credits total=%d\n",
__func__, le16_to_cpu(shdr->CreditRequest),
scredits);
@@ -993,6 +1012,10 @@ next_pdu:
if (mids[i] != NULL) {
mids[i]->resp_buf_size = server->pdu_size;
+ if (bufs[i] && server->ops->is_network_name_deleted)
+ server->ops->is_network_name_deleted(bufs[i],
+ server);
+
if (!mids[i]->multiRsp || mids[i]->multiEnd)
mids[i]->callback(mids[i]);
@@ -1317,6 +1340,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx)
goto out_err_crypto_release;
}
+ tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId);
tcp_ses->noblockcnt = ctx->rootfs;
tcp_ses->noblocksnd = ctx->noblocksnd || ctx->rootfs;
tcp_ses->noautotune = ctx->noautotune;
@@ -1838,9 +1862,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
/* new SMB session uses our server ref */
ses->server = server;
if (server->dstaddr.ss_family == AF_INET6)
- sprintf(ses->serverName, "%pI6", &addr6->sin6_addr);
+ sprintf(ses->ip_addr, "%pI6", &addr6->sin6_addr);
else
- sprintf(ses->serverName, "%pI4", &addr->sin_addr);
+ sprintf(ses->ip_addr, "%pI4", &addr->sin_addr);
if (ctx->username) {
ses->user_name = kstrdup(ctx->username, GFP_KERNEL);
@@ -2269,7 +2293,9 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
if (strcmp(old->local_nls->charset, new->local_nls->charset))
return 0;
- if (old->ctx->actimeo != new->ctx->actimeo)
+ if (old->ctx->acregmax != new->ctx->acregmax)
+ return 0;
+ if (old->ctx->acdirmax != new->ctx->acdirmax)
return 0;
return 1;
@@ -2911,7 +2937,7 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
#ifdef CONFIG_CIFS_DFS_UPCALL
/*
* cifs_build_path_to_root returns full path to root when we do not have an
- * exiting connection (tcon)
+ * existing connection (tcon)
*/
static char *
build_unc_path_to_root(const struct smb3_fs_context *ctx,
@@ -3038,96 +3064,91 @@ static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it,
return 0;
}
-static int setup_dfs_tgt_conn(const char *path, const char *full_path,
- const struct dfs_cache_tgt_iterator *tgt_it,
- struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx,
- unsigned int *xid, struct TCP_Server_Info **server,
- struct cifs_ses **ses, struct cifs_tcon **tcon)
-{
- int rc;
- struct dfs_info3_param ref = {0};
- char *mdata = NULL;
- struct smb3_fs_context fake_ctx = {NULL};
- char *fake_devname = NULL;
-
- cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path);
-
- rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref);
- if (rc)
- return rc;
-
- mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options,
- full_path + 1, &ref,
- &fake_devname);
- free_dfs_info_param(&ref);
-
- if (IS_ERR(mdata)) {
- rc = PTR_ERR(mdata);
- mdata = NULL;
- } else
- rc = cifs_setup_volume_info(&fake_ctx, mdata, fake_devname);
-
- kfree(mdata);
- kfree(fake_devname);
-
- if (!rc) {
- /*
- * We use a 'fake_ctx' here because we need pass it down to the
- * mount_{get,put} functions to test connection against new DFS
- * targets.
- */
- mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon);
- rc = mount_get_conns(&fake_ctx, cifs_sb, xid, server, ses,
- tcon);
- if (!rc || (*server && *ses)) {
- /*
- * We were able to connect to new target server.
- * Update current context with new target server.
- */
- rc = update_vol_info(tgt_it, &fake_ctx, ctx);
- }
- }
- smb3_cleanup_fs_context_contents(&fake_ctx);
- return rc;
-}
-
static int do_dfs_failover(const char *path, const char *full_path, struct cifs_sb_info *cifs_sb,
struct smb3_fs_context *ctx, struct cifs_ses *root_ses,
unsigned int *xid, struct TCP_Server_Info **server,
struct cifs_ses **ses, struct cifs_tcon **tcon)
{
int rc;
- struct dfs_cache_tgt_list tgt_list;
+ struct dfs_cache_tgt_list tgt_list = {0};
struct dfs_cache_tgt_iterator *tgt_it = NULL;
+ struct smb3_fs_context tmp_ctx = {NULL};
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
return -EOPNOTSUPP;
+ cifs_dbg(FYI, "%s: path=%s full_path=%s\n", __func__, path, full_path);
+
rc = dfs_cache_noreq_find(path, NULL, &tgt_list);
if (rc)
return rc;
+ /*
+ * We use a 'tmp_ctx' here because we need pass it down to the mount_{get,put} functions to
+ * test connection against new DFS targets.
+ */
+ rc = smb3_fs_context_dup(&tmp_ctx, ctx);
+ if (rc)
+ goto out;
for (;;) {
+ struct dfs_info3_param ref = {0};
+ char *fake_devname = NULL, *mdata = NULL;
+
/* Get next DFS target server - if any */
rc = get_next_dfs_tgt(path, &tgt_list, &tgt_it);
if (rc)
break;
- /* Connect to next DFS target */
- rc = setup_dfs_tgt_conn(path, full_path, tgt_it, cifs_sb, ctx, xid, server, ses,
- tcon);
- if (!rc || (*server && *ses))
+
+ rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref);
+ if (rc)
+ break;
+
+ cifs_dbg(FYI, "%s: old ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC,
+ tmp_ctx.prepath);
+
+ mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, full_path + 1, &ref,
+ &fake_devname);
+ free_dfs_info_param(&ref);
+
+ if (IS_ERR(mdata)) {
+ rc = PTR_ERR(mdata);
+ mdata = NULL;
+ } else
+ rc = cifs_setup_volume_info(&tmp_ctx, mdata, fake_devname);
+
+ kfree(mdata);
+ kfree(fake_devname);
+
+ if (rc)
break;
+
+ cifs_dbg(FYI, "%s: new ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC,
+ tmp_ctx.prepath);
+
+ mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon);
+ rc = mount_get_conns(&tmp_ctx, cifs_sb, xid, server, ses, tcon);
+ if (!rc || (*server && *ses)) {
+ /*
+ * We were able to connect to new target server. Update current context with
+ * new target server.
+ */
+ rc = update_vol_info(tgt_it, &tmp_ctx, ctx);
+ break;
+ }
}
if (!rc) {
+ cifs_dbg(FYI, "%s: final ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC,
+ tmp_ctx.prepath);
/*
- * Update DFS target hint in DFS referral cache with the target
- * server we successfully reconnected to.
+ * Update DFS target hint in DFS referral cache with the target server we
+ * successfully reconnected to.
*/
- rc = dfs_cache_update_tgthint(*xid, root_ses ? root_ses : *ses,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb), path,
- tgt_it);
+ rc = dfs_cache_update_tgthint(*xid, root_ses ? root_ses : *ses, cifs_sb->local_nls,
+ cifs_remap(cifs_sb), path, tgt_it);
}
+
+out:
+ smb3_cleanup_fs_context_contents(&tmp_ctx);
dfs_cache_free_tgts(&tgt_list);
return rc;
}
@@ -3285,77 +3306,77 @@ static void put_root_ses(struct cifs_ses *ses)
cifs_put_smb_ses(ses);
}
-/* Check if a path component is remote and then update @dfs_path accordingly */
-static int check_dfs_prepath(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx,
- const unsigned int xid, struct TCP_Server_Info *server,
- struct cifs_tcon *tcon, char **dfs_path)
+/* Set up next dfs prefix path in @dfs_path */
+static int next_dfs_prepath(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx,
+ const unsigned int xid, struct TCP_Server_Info *server,
+ struct cifs_tcon *tcon, char **dfs_path)
{
- char *path, *s;
- char sep = CIFS_DIR_SEP(cifs_sb), tmp;
- char *npath;
- int rc = 0;
- int added_treename = tcon->Flags & SMB_SHARE_IS_IN_DFS;
- int skip = added_treename;
+ char *path, *npath;
+ int added_treename = is_tcon_dfs(tcon);
+ int rc;
path = cifs_build_path_to_root(ctx, cifs_sb, tcon, added_treename);
if (!path)
return -ENOMEM;
- /*
- * Walk through the path components in @path and check if they're accessible. In case any of
- * the components is -EREMOTE, then update @dfs_path with the next DFS referral request path
- * (NOT including the remaining components).
- */
- s = path;
- do {
- /* skip separators */
- while (*s && *s == sep)
- s++;
- if (!*s)
- break;
- /* next separator */
- while (*s && *s != sep)
- s++;
- /*
- * if the treename is added, we then have to skip the first
- * part within the separators
- */
- if (skip) {
- skip = 0;
- continue;
+ rc = is_path_remote(cifs_sb, ctx, xid, server, tcon);
+ if (rc == -EREMOTE) {
+ struct smb3_fs_context v = {NULL};
+ /* if @path contains a tree name, skip it in the prefix path */
+ if (added_treename) {
+ rc = smb3_parse_devname(path, &v);
+ if (rc)
+ goto out;
+ npath = build_unc_path_to_root(&v, cifs_sb, true);
+ smb3_cleanup_fs_context_contents(&v);
+ } else {
+ v.UNC = ctx->UNC;
+ v.prepath = path + 1;
+ npath = build_unc_path_to_root(&v, cifs_sb, true);
}
- tmp = *s;
- *s = 0;
- rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, path);
- if (rc && rc == -EREMOTE) {
- struct smb3_fs_context v = {NULL};
- /* if @path contains a tree name, skip it in the prefix path */
- if (added_treename) {
- rc = smb3_parse_devname(path, &v);
- if (rc)
- break;
- rc = -EREMOTE;
- npath = build_unc_path_to_root(&v, cifs_sb, true);
- smb3_cleanup_fs_context_contents(&v);
- } else {
- v.UNC = ctx->UNC;
- v.prepath = path + 1;
- npath = build_unc_path_to_root(&v, cifs_sb, true);
- }
- if (IS_ERR(npath)) {
- rc = PTR_ERR(npath);
- break;
- }
- kfree(*dfs_path);
- *dfs_path = npath;
+
+ if (IS_ERR(npath)) {
+ rc = PTR_ERR(npath);
+ goto out;
}
- *s = tmp;
- } while (rc == 0);
+ kfree(*dfs_path);
+ *dfs_path = npath;
+ rc = -EREMOTE;
+ }
+
+out:
kfree(path);
return rc;
}
+/* Check if resolved targets can handle any DFS referrals */
+static int is_referral_server(const char *ref_path, struct cifs_tcon *tcon, bool *ref_server)
+{
+ int rc;
+ struct dfs_info3_param ref = {0};
+
+ if (is_tcon_dfs(tcon)) {
+ *ref_server = true;
+ } else {
+ cifs_dbg(FYI, "%s: ref_path=%s\n", __func__, ref_path);
+
+ rc = dfs_cache_noreq_find(ref_path, &ref, NULL);
+ if (rc) {
+ cifs_dbg(VFS, "%s: dfs_cache_noreq_find: failed (rc=%d)\n", __func__, rc);
+ return rc;
+ }
+ cifs_dbg(FYI, "%s: ref.flags=0x%x\n", __func__, ref.flags);
+ /*
+ * Check if all targets are capable of handling DFS referrals as per
+ * MS-DFSC 2.2.4 RESP_GET_DFS_REFERRAL.
+ */
+ *ref_server = !!(ref.flags & DFSREF_REFERRAL_SERVER);
+ free_dfs_info_param(&ref);
+ }
+ return 0;
+}
+
int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
{
int rc = 0;
@@ -3367,18 +3388,19 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
char *ref_path = NULL, *full_path = NULL;
char *oldmnt = NULL;
char *mntdata = NULL;
+ bool ref_server = false;
rc = mount_get_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon);
/*
- * Unconditionally try to get an DFS referral (even cached) to determine whether it is an
- * DFS mount.
+ * If called with 'nodfs' mount option, then skip DFS resolving. Otherwise unconditionally
+ * try to get an DFS referral (even cached) to determine whether it is an DFS mount.
*
* Skip prefix path to provide support for DFS referrals from w2k8 servers which don't seem
* to respond with PATH_NOT_COVERED to requests that include the prefix.
*/
- if (dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), ctx->UNC + 1, NULL,
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) ||
+ dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), ctx->UNC + 1, NULL,
NULL)) {
- /* No DFS referral was returned. Looks like a regular share. */
if (rc)
goto error;
/* Check if it is fully accessible and then mount it */
@@ -3432,13 +3454,18 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
break;
if (!tcon)
continue;
+
/* Make sure that requests go through new root servers */
- if (is_tcon_dfs(tcon)) {
+ rc = is_referral_server(ref_path + 1, tcon, &ref_server);
+ if (rc)
+ break;
+ if (ref_server) {
put_root_ses(root_ses);
set_root_ses(cifs_sb, ses, &root_ses);
}
- /* Check for remaining path components and then continue chasing them (-EREMOTE) */
- rc = check_dfs_prepath(cifs_sb, ctx, xid, server, tcon, &ref_path);
+
+ /* Get next dfs path and then continue chasing them if -EREMOTE */
+ rc = next_dfs_prepath(cifs_sb, ctx, xid, server, tcon, &ref_path);
/* Prevent recursion on broken link referrals */
if (rc == -EREMOTE && ++count > MAX_NESTED_LINKS)
rc = -ELOOP;
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 4950ab0486ae..098b4bc8da59 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -37,11 +37,12 @@ struct cache_dfs_tgt {
struct cache_entry {
struct hlist_node hlist;
const char *path;
- int ttl;
- int srvtype;
- int flags;
+ int hdr_flags; /* RESP_GET_DFS_REFERRAL.ReferralHeaderFlags */
+ int ttl; /* DFS_REREFERRAL_V3.TimeToLive */
+ int srvtype; /* DFS_REREFERRAL_V3.ServerType */
+ int ref_flags; /* DFS_REREFERRAL_V3.ReferralEntryFlags */
struct timespec64 etime;
- int path_consumed;
+ int path_consumed; /* RESP_GET_DFS_REFERRAL.PathConsumed */
int numtgts;
struct list_head tlist;
struct cache_dfs_tgt *tgthint;
@@ -166,14 +167,11 @@ static int dfscache_proc_show(struct seq_file *m, void *v)
continue;
seq_printf(m,
- "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,"
- "interlink=%s,path_consumed=%d,expired=%s\n",
- ce->path,
- ce->srvtype == DFS_TYPE_ROOT ? "root" : "link",
- ce->ttl, ce->etime.tv_nsec,
- IS_INTERLINK_SET(ce->flags) ? "yes" : "no",
- ce->path_consumed,
- cache_entry_expired(ce) ? "yes" : "no");
+ "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n",
+ ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link",
+ ce->ttl, ce->etime.tv_nsec, ce->ref_flags, ce->hdr_flags,
+ IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no",
+ ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no");
list_for_each_entry(t, &ce->tlist, list) {
seq_printf(m, " %s%s\n",
@@ -236,11 +234,12 @@ static inline void dump_tgts(const struct cache_entry *ce)
static inline void dump_ce(const struct cache_entry *ce)
{
- cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,interlink=%s,path_consumed=%d,expired=%s\n",
+ cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n",
ce->path,
ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl,
ce->etime.tv_nsec,
- IS_INTERLINK_SET(ce->flags) ? "yes" : "no",
+ ce->hdr_flags, ce->ref_flags,
+ IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no",
ce->path_consumed,
cache_entry_expired(ce) ? "yes" : "no");
dump_tgts(ce);
@@ -381,7 +380,8 @@ static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs,
ce->ttl = refs[0].ttl;
ce->etime = get_expire_time(ce->ttl);
ce->srvtype = refs[0].server_type;
- ce->flags = refs[0].ref_flag;
+ ce->hdr_flags = refs[0].flags;
+ ce->ref_flags = refs[0].ref_flag;
ce->path_consumed = refs[0].path_consumed;
for (i = 0; i < numrefs; i++) {
@@ -799,7 +799,8 @@ static int setup_referral(const char *path, struct cache_entry *ce,
ref->path_consumed = ce->path_consumed;
ref->ttl = ce->ttl;
ref->server_type = ce->srvtype;
- ref->ref_flag = ce->flags;
+ ref->ref_flag = ce->ref_flags;
+ ref->flags = ce->hdr_flags;
return 0;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 6d001905c8e5..26de4329d161 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -580,7 +580,7 @@ int cifs_open(struct inode *inode, struct file *file)
} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
if (tcon->ses->serverNOS)
cifs_dbg(VFS, "server %s of type %s returned unexpected error on SMB posix open, disabling posix open support. Check if server update available.\n",
- tcon->ses->serverName,
+ tcon->ses->ip_addr,
tcon->ses->serverNOS);
tcon->broken_posix_open = true;
} else if ((rc != -EIO) && (rc != -EREMOTE) &&
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 12a5da0230b5..892f51a21278 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -140,6 +140,8 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_u32("rsize", Opt_rsize),
fsparam_u32("wsize", Opt_wsize),
fsparam_u32("actimeo", Opt_actimeo),
+ fsparam_u32("acdirmax", Opt_acdirmax),
+ fsparam_u32("acregmax", Opt_acregmax),
fsparam_u32("echo_interval", Opt_echo_interval),
fsparam_u32("max_credits", Opt_max_credits),
fsparam_u32("handletimeout", Opt_handletimeout),
@@ -397,7 +399,7 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3)
ctx->vals = &smb3any_values;
break;
case Smb_default:
- ctx->ops = &smb30_operations; /* currently identical with 3.0 */
+ ctx->ops = &smb30_operations;
ctx->vals = &smbdefault_values;
break;
default:
@@ -542,20 +544,37 @@ static int smb3_fs_context_parse_monolithic(struct fs_context *fc,
/* BB Need to add support for sep= here TBD */
while ((key = strsep(&options, ",")) != NULL) {
- if (*key) {
- size_t v_len = 0;
- char *value = strchr(key, '=');
-
- if (value) {
- if (value == key)
- continue;
- *value++ = 0;
- v_len = strlen(value);
- }
- ret = vfs_parse_fs_string(fc, key, value, v_len);
- if (ret < 0)
- break;
+ size_t len;
+ char *value;
+
+ if (*key == 0)
+ break;
+
+ /* Check if following character is the deliminator If yes,
+ * we have encountered a double deliminator reset the NULL
+ * character to the deliminator
+ */
+ while (options && options[0] == ',') {
+ len = strlen(key);
+ strcpy(key + len, options);
+ options = strchr(options, ',');
+ if (options)
+ *options++ = 0;
+ }
+
+
+ len = 0;
+ value = strchr(key, '=');
+ if (value) {
+ if (value == key)
+ continue;
+ *value++ = 0;
+ len = strlen(value);
}
+
+ ret = vfs_parse_fs_string(fc, key, value, len);
+ if (ret < 0)
+ break;
}
return ret;
@@ -929,12 +948,31 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->wsize = result.uint_32;
ctx->got_wsize = true;
break;
+ case Opt_acregmax:
+ ctx->acregmax = HZ * result.uint_32;
+ if (ctx->acregmax > CIFS_MAX_ACTIMEO) {
+ cifs_dbg(VFS, "acregmax too large\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_acdirmax:
+ ctx->acdirmax = HZ * result.uint_32;
+ if (ctx->acdirmax > CIFS_MAX_ACTIMEO) {
+ cifs_dbg(VFS, "acdirmax too large\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
case Opt_actimeo:
- ctx->actimeo = HZ * result.uint_32;
- if (ctx->actimeo > CIFS_MAX_ACTIMEO) {
- cifs_dbg(VFS, "attribute cache timeout too large\n");
+ if (HZ * result.uint_32 > CIFS_MAX_ACTIMEO) {
+ cifs_dbg(VFS, "timeout too large\n");
goto cifs_parse_mount_err;
}
+ if ((ctx->acdirmax != CIFS_DEF_ACTIMEO) ||
+ (ctx->acregmax != CIFS_DEF_ACTIMEO)) {
+ cifs_dbg(VFS, "actimeo ignored since acregmax or acdirmax specified\n");
+ break;
+ }
+ ctx->acdirmax = ctx->acregmax = HZ * result.uint_32;
break;
case Opt_echo_interval:
ctx->echo_interval = result.uint_32;
@@ -1361,7 +1399,8 @@ int smb3_init_fs_context(struct fs_context *fc)
/* default is to use strict cifs caching semantics */
ctx->strict_io = true;
- ctx->actimeo = CIFS_DEF_ACTIMEO;
+ ctx->acregmax = CIFS_DEF_ACTIMEO;
+ ctx->acdirmax = CIFS_DEF_ACTIMEO;
/* Most clients set timeout to 0, allows server to use its default */
ctx->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
index 1c44a460e2c0..87dd1f7168f2 100644
--- a/fs/cifs/fs_context.h
+++ b/fs/cifs/fs_context.h
@@ -118,6 +118,8 @@ enum cifs_param {
Opt_rsize,
Opt_wsize,
Opt_actimeo,
+ Opt_acdirmax,
+ Opt_acregmax,
Opt_echo_interval,
Opt_max_credits,
Opt_snapshot,
@@ -232,7 +234,9 @@ struct smb3_fs_context {
unsigned int wsize;
unsigned int min_offload;
bool sockopt_tcp_nodelay:1;
- unsigned long actimeo; /* attribute cache timeout (jiffies) */
+ /* attribute cache timemout for files and directories in jiffies */
+ unsigned long acregmax;
+ unsigned long acdirmax;
struct smb_version_operations *ops;
struct smb_version_values *vals;
char *prepath;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 3e9c7bb23f26..7c61bc9573c0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2199,12 +2199,23 @@ cifs_inode_needs_reval(struct inode *inode)
if (!lookupCacheEnabled)
return true;
- if (!cifs_sb->ctx->actimeo)
- return true;
-
- if (!time_in_range(jiffies, cifs_i->time,
- cifs_i->time + cifs_sb->ctx->actimeo))
- return true;
+ /*
+ * depending on inode type, check if attribute caching disabled for
+ * files or directories
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ if (!cifs_sb->ctx->acdirmax)
+ return true;
+ if (!time_in_range(jiffies, cifs_i->time,
+ cifs_i->time + cifs_sb->ctx->acdirmax))
+ return true;
+ } else { /* file */
+ if (!cifs_sb->ctx->acregmax)
+ return true;
+ if (!time_in_range(jiffies, cifs_i->time,
+ cifs_i->time + cifs_sb->ctx->acregmax))
+ return true;
+ }
/* hardlinked files w/ noserverino get "special" treatment */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 213465718fa8..183a3a868d7b 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -218,7 +218,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
/* UNC and paths */
/* XXX: Use ses->server->hostname? */
- sprintf(unc, unc_fmt, ses->serverName);
+ sprintf(unc, unc_fmt, ses->ip_addr);
ctx.UNC = unc;
ctx.prepath = "";
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index f19274857292..f5087295424c 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -63,17 +63,19 @@ smb2_add_credits(struct TCP_Server_Info *server,
const struct cifs_credits *credits, const int optype)
{
int *val, rc = -1;
+ int scredits, in_flight;
unsigned int add = credits->value;
unsigned int instance = credits->instance;
bool reconnect_detected = false;
+ bool reconnect_with_invalid_credits = false;
spin_lock(&server->req_lock);
val = server->ops->get_credits_field(server, optype);
/* eg found case where write overlapping reconnect messed up credits */
if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0))
- trace_smb3_reconnect_with_invalid_credits(server->CurrentMid,
- server->hostname, *val, add);
+ reconnect_with_invalid_credits = true;
+
if ((instance == 0) || (instance == server->reconnect_instance))
*val += add;
else
@@ -84,7 +86,9 @@ smb2_add_credits(struct TCP_Server_Info *server,
pr_warn_once("server overflowed SMB3 credits\n");
}
server->in_flight--;
- if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP)
+ if (server->in_flight == 0 &&
+ ((optype & CIFS_OP_MASK) != CIFS_NEG_OP) &&
+ ((optype & CIFS_OP_MASK) != CIFS_SESS_OP))
rc = change_conf(server);
/*
* Sometimes server returns 0 credits on oplock break ack - we need to
@@ -97,14 +101,26 @@ smb2_add_credits(struct TCP_Server_Info *server,
server->oplock_credits++;
}
}
+ scredits = *val;
+ in_flight = server->in_flight;
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
if (reconnect_detected) {
+ trace_smb3_reconnect_detected(server->CurrentMid,
+ server->conn_id, server->hostname, scredits, add, in_flight);
+
cifs_dbg(FYI, "trying to put %d credits from the old server instance %d\n",
add, instance);
}
+ if (reconnect_with_invalid_credits) {
+ trace_smb3_reconnect_with_invalid_credits(server->CurrentMid,
+ server->conn_id, server->hostname, scredits, add, in_flight);
+ cifs_dbg(FYI, "Negotiate operation when server credits is non-zero. Optype: %d, server credits: %d, credits added: %d\n",
+ optype, scredits, add);
+ }
+
if (server->tcpStatus == CifsNeedReconnect
|| server->tcpStatus == CifsExiting)
return;
@@ -123,23 +139,30 @@ smb2_add_credits(struct TCP_Server_Info *server,
cifs_dbg(FYI, "disabling oplocks\n");
break;
default:
- trace_smb3_add_credits(server->CurrentMid,
- server->hostname, rc, add);
- cifs_dbg(FYI, "%s: added %u credits total=%d\n", __func__, add, rc);
+ /* change_conf rebalanced credits for different types */
+ break;
}
+
+ trace_smb3_add_credits(server->CurrentMid,
+ server->conn_id, server->hostname, scredits, add, in_flight);
+ cifs_dbg(FYI, "%s: added %u credits total=%d\n", __func__, add, scredits);
}
static void
smb2_set_credits(struct TCP_Server_Info *server, const int val)
{
+ int scredits, in_flight;
+
spin_lock(&server->req_lock);
server->credits = val;
if (val == 1)
server->reconnect_instance++;
+ scredits = server->credits;
+ in_flight = server->in_flight;
spin_unlock(&server->req_lock);
trace_smb3_set_credits(server->CurrentMid,
- server->hostname, val, val);
+ server->conn_id, server->hostname, scredits, val, in_flight);
cifs_dbg(FYI, "%s: set %u credits\n", __func__, val);
/* don't log while holding the lock */
@@ -171,7 +194,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
unsigned int *num, struct cifs_credits *credits)
{
int rc = 0;
- unsigned int scredits;
+ unsigned int scredits, in_flight;
spin_lock(&server->req_lock);
while (1) {
@@ -208,17 +231,18 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
credits->instance = server->reconnect_instance;
server->credits -= credits->value;
- scredits = server->credits;
server->in_flight++;
if (server->in_flight > server->max_in_flight)
server->max_in_flight = server->in_flight;
break;
}
}
+ scredits = server->credits;
+ in_flight = server->in_flight;
spin_unlock(&server->req_lock);
trace_smb3_add_credits(server->CurrentMid,
- server->hostname, scredits, -(credits->value));
+ server->conn_id, server->hostname, scredits, -(credits->value), in_flight);
cifs_dbg(FYI, "%s: removed %u credits total=%d\n",
__func__, credits->value, scredits);
@@ -231,14 +255,14 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
const unsigned int payload_size)
{
int new_val = DIV_ROUND_UP(payload_size, SMB2_MAX_BUFFER_SIZE);
- int scredits;
+ int scredits, in_flight;
if (!credits->value || credits->value == new_val)
return 0;
if (credits->value < new_val) {
trace_smb3_too_many_credits(server->CurrentMid,
- server->hostname, 0, credits->value - new_val);
+ server->conn_id, server->hostname, 0, credits->value - new_val, 0);
cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)",
credits->value, new_val);
@@ -248,9 +272,13 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
spin_lock(&server->req_lock);
if (server->reconnect_instance != credits->instance) {
+ scredits = server->credits;
+ in_flight = server->in_flight;
spin_unlock(&server->req_lock);
+
trace_smb3_reconnect_detected(server->CurrentMid,
- server->hostname, 0, 0);
+ server->conn_id, server->hostname, scredits,
+ credits->value - new_val, in_flight);
cifs_server_dbg(VFS, "trying to return %d credits to old session\n",
credits->value - new_val);
return -EAGAIN;
@@ -258,15 +286,18 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
server->credits += credits->value - new_val;
scredits = server->credits;
+ in_flight = server->in_flight;
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- credits->value = new_val;
trace_smb3_add_credits(server->CurrentMid,
- server->hostname, scredits, credits->value - new_val);
+ server->conn_id, server->hostname, scredits,
+ credits->value - new_val, in_flight);
cifs_dbg(FYI, "%s: adjust added %u credits total=%d\n",
__func__, credits->value - new_val, scredits);
+ credits->value = new_val;
+
return 0;
}
@@ -2369,7 +2400,7 @@ static bool
smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
{
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
- int scredits;
+ int scredits, in_flight;
if (shdr->Status != STATUS_PENDING)
return false;
@@ -2378,11 +2409,13 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
spin_lock(&server->req_lock);
server->credits += le16_to_cpu(shdr->CreditRequest);
scredits = server->credits;
+ in_flight = server->in_flight;
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
trace_smb3_add_credits(server->CurrentMid,
- server->hostname, scredits, le16_to_cpu(shdr->CreditRequest));
+ server->conn_id, server->hostname, scredits,
+ le16_to_cpu(shdr->CreditRequest), in_flight);
cifs_dbg(FYI, "%s: status pending add %u credits total=%d\n",
__func__, le16_to_cpu(shdr->CreditRequest), scredits);
}
@@ -2418,6 +2451,34 @@ smb2_is_status_io_timeout(char *buf)
return false;
}
+static void
+smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
+{
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+ struct list_head *tmp, *tmp1;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+
+ if (shdr->Status != STATUS_NETWORK_NAME_DELETED)
+ return;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each(tmp, &server->smb_ses_list) {
+ ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+ list_for_each(tmp1, &ses->tcon_list) {
+ tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+ if (tcon->tid == shdr->TreeId) {
+ tcon->need_reconnect = true;
+ spin_unlock(&cifs_tcp_ses_lock);
+ pr_warn_once("Server share %s deleted.\n",
+ tcon->treeName);
+ return;
+ }
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+}
+
static int
smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
struct cifsInodeInfo *cinode)
@@ -4605,6 +4666,10 @@ static void smb2_decrypt_offload(struct work_struct *work)
#ifdef CONFIG_CIFS_STATS2
mid->when_received = jiffies;
#endif
+ if (dw->server->ops->is_network_name_deleted)
+ dw->server->ops->is_network_name_deleted(dw->buf,
+ dw->server);
+
mid->callback(mid);
} else {
spin_lock(&GlobalMid_Lock);
@@ -4723,6 +4788,12 @@ non_offloaded_decrypt:
rc = handle_read_data(server, *mid, buf,
server->vals->read_rsp_size,
pages, npages, len, false);
+ if (rc >= 0) {
+ if (server->ops->is_network_name_deleted) {
+ server->ops->is_network_name_deleted(buf,
+ server);
+ }
+ }
}
free_pages:
@@ -5072,6 +5143,7 @@ struct smb_version_operations smb20_operations = {
.fiemap = smb3_fiemap,
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
+ .is_network_name_deleted = smb2_is_network_name_deleted,
};
struct smb_version_operations smb21_operations = {
@@ -5173,6 +5245,7 @@ struct smb_version_operations smb21_operations = {
.fiemap = smb3_fiemap,
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
+ .is_network_name_deleted = smb2_is_network_name_deleted,
};
struct smb_version_operations smb30_operations = {
@@ -5286,6 +5359,7 @@ struct smb_version_operations smb30_operations = {
.fiemap = smb3_fiemap,
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
+ .is_network_name_deleted = smb2_is_network_name_deleted,
};
struct smb_version_operations smb311_operations = {
@@ -5399,6 +5473,7 @@ struct smb_version_operations smb311_operations = {
.fiemap = smb3_fiemap,
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
+ .is_network_name_deleted = smb2_is_network_name_deleted,
};
struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 794fc3b68b4f..4bbb6126b14d 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -814,8 +814,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
SMB3ANY_VERSION_STRING) == 0) {
req->Dialects[0] = cpu_to_le16(SMB30_PROT_ID);
req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID);
- req->DialectCount = cpu_to_le16(2);
- total_len += 4;
+ req->Dialects[2] = cpu_to_le16(SMB311_PROT_ID);
+ req->DialectCount = cpu_to_le16(3);
+ total_len += 6;
} else if (strcmp(server->vals->version_string,
SMBDEFAULT_VERSION_STRING) == 0) {
req->Dialects[0] = cpu_to_le16(SMB21_PROT_ID);
@@ -849,6 +850,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
SMB2_CLIENT_GUID_SIZE);
if ((server->vals->protocol_id == SMB311_PROT_ID) ||
(strcmp(server->vals->version_string,
+ SMB3ANY_VERSION_STRING) == 0) ||
+ (strcmp(server->vals->version_string,
SMBDEFAULT_VERSION_STRING) == 0))
assemble_neg_contexts(req, server, &total_len);
}
@@ -883,6 +886,10 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
cifs_server_dbg(VFS,
"SMB2.1 dialect returned but not requested\n");
return -EIO;
+ } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) {
+ /* ops set to 3.0 by default for default so update */
+ server->ops = &smb311_operations;
+ server->vals = &smb311_values;
}
} else if (strcmp(server->vals->version_string,
SMBDEFAULT_VERSION_STRING) == 0) {
@@ -1042,10 +1049,11 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
SMB3ANY_VERSION_STRING) == 0) {
pneg_inbuf->Dialects[0] = cpu_to_le16(SMB30_PROT_ID);
pneg_inbuf->Dialects[1] = cpu_to_le16(SMB302_PROT_ID);
- pneg_inbuf->DialectCount = cpu_to_le16(2);
- /* structure is big enough for 3 dialects, sending only 2 */
+ pneg_inbuf->Dialects[2] = cpu_to_le16(SMB311_PROT_ID);
+ pneg_inbuf->DialectCount = cpu_to_le16(3);
+ /* SMB 2.1 not included so subtract one dialect from len */
inbuflen = sizeof(*pneg_inbuf) -
- (2 * sizeof(pneg_inbuf->Dialects[0]));
+ (sizeof(pneg_inbuf->Dialects[0]));
} else if (strcmp(server->vals->version_string,
SMBDEFAULT_VERSION_STRING) == 0) {
pneg_inbuf->Dialects[0] = cpu_to_le16(SMB21_PROT_ID);
@@ -1053,7 +1061,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
pneg_inbuf->Dialects[2] = cpu_to_le16(SMB302_PROT_ID);
pneg_inbuf->Dialects[3] = cpu_to_le16(SMB311_PROT_ID);
pneg_inbuf->DialectCount = cpu_to_le16(4);
- /* structure is big enough for 3 dialects */
+ /* structure is big enough for 4 dialects */
inbuflen = sizeof(*pneg_inbuf);
} else {
/* otherwise specific dialect was requested */
@@ -1253,7 +1261,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data)
cifs_ses_server(sess_data->ses),
&rqst,
&sess_data->buf0_type,
- CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov);
+ CIFS_LOG_ERROR | CIFS_SESS_OP, &rsp_iov);
cifs_small_buf_release(sess_data->iov[0].iov_base);
memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec));
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index c3d1a584f251..d6df908dccad 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -851,17 +851,21 @@ DEFINE_SMB3_LEASE_ERR_EVENT(lease_err);
DECLARE_EVENT_CLASS(smb3_reconnect_class,
TP_PROTO(__u64 currmid,
+ __u64 conn_id,
char *hostname),
- TP_ARGS(currmid, hostname),
+ TP_ARGS(currmid, conn_id, hostname),
TP_STRUCT__entry(
__field(__u64, currmid)
+ __field(__u64, conn_id)
__field(char *, hostname)
),
TP_fast_assign(
__entry->currmid = currmid;
+ __entry->conn_id = conn_id;
__entry->hostname = hostname;
),
- TP_printk("server=%s current_mid=0x%llx",
+ TP_printk("conn_id=0x%llx server=%s current_mid=%llu",
+ __entry->conn_id,
__entry->hostname,
__entry->currmid)
)
@@ -869,44 +873,56 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class,
#define DEFINE_SMB3_RECONNECT_EVENT(name) \
DEFINE_EVENT(smb3_reconnect_class, smb3_##name, \
TP_PROTO(__u64 currmid, \
- char *hostname), \
- TP_ARGS(currmid, hostname))
+ __u64 conn_id, \
+ char *hostname), \
+ TP_ARGS(currmid, conn_id, hostname))
DEFINE_SMB3_RECONNECT_EVENT(reconnect);
DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect);
DECLARE_EVENT_CLASS(smb3_credit_class,
TP_PROTO(__u64 currmid,
+ __u64 conn_id,
char *hostname,
int credits,
- int credits_to_add),
- TP_ARGS(currmid, hostname, credits, credits_to_add),
+ int credits_to_add,
+ int in_flight),
+ TP_ARGS(currmid, conn_id, hostname, credits, credits_to_add, in_flight),
TP_STRUCT__entry(
__field(__u64, currmid)
+ __field(__u64, conn_id)
__field(char *, hostname)
__field(int, credits)
__field(int, credits_to_add)
+ __field(int, in_flight)
),
TP_fast_assign(
__entry->currmid = currmid;
+ __entry->conn_id = conn_id;
__entry->hostname = hostname;
__entry->credits = credits;
__entry->credits_to_add = credits_to_add;
+ __entry->in_flight = in_flight;
),
- TP_printk("server=%s current_mid=0x%llx credits=%d credits_to_add=%d",
+ TP_printk("conn_id=0x%llx server=%s current_mid=%llu "
+ "credits=%d credit_change=%d in_flight=%d",
+ __entry->conn_id,
__entry->hostname,
__entry->currmid,
__entry->credits,
- __entry->credits_to_add)
+ __entry->credits_to_add,
+ __entry->in_flight)
)
#define DEFINE_SMB3_CREDIT_EVENT(name) \
DEFINE_EVENT(smb3_credit_class, smb3_##name, \
TP_PROTO(__u64 currmid, \
+ __u64 conn_id, \
char *hostname, \
int credits, \
- int credits_to_add), \
- TP_ARGS(currmid, hostname, credits, credits_to_add))
+ int credits_to_add, \
+ int in_flight), \
+ TP_ARGS(currmid, conn_id, hostname, credits, credits_to_add, in_flight))
DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits);
DEFINE_SMB3_CREDIT_EVENT(reconnect_detected);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 4a2b836eb017..e90a1d1380b0 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -445,7 +445,7 @@ unmask:
*/
server->tcpStatus = CifsNeedReconnect;
trace_smb3_partial_send_reconnect(server->CurrentMid,
- server->hostname);
+ server->conn_id, server->hostname);
}
smbd_done:
if (rc < 0 && rc != -EINTR)
@@ -527,7 +527,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
int *credits;
int optype;
long int t;
- int scredits = server->credits;
+ int scredits, in_flight;
if (timeout < 0)
t = MAX_JIFFY_OFFSET;
@@ -551,23 +551,39 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
server->max_in_flight = server->in_flight;
*credits -= 1;
*instance = server->reconnect_instance;
+ scredits = *credits;
+ in_flight = server->in_flight;
spin_unlock(&server->req_lock);
+
+ trace_smb3_add_credits(server->CurrentMid,
+ server->conn_id, server->hostname, scredits, -1, in_flight);
+ cifs_dbg(FYI, "%s: remove %u credits total=%d\n",
+ __func__, 1, scredits);
+
return 0;
}
while (1) {
if (*credits < num_credits) {
+ scredits = *credits;
spin_unlock(&server->req_lock);
+
cifs_num_waiters_inc(server);
rc = wait_event_killable_timeout(server->request_q,
has_credits(server, credits, num_credits), t);
cifs_num_waiters_dec(server);
if (!rc) {
+ spin_lock(&server->req_lock);
+ scredits = *credits;
+ in_flight = server->in_flight;
+ spin_unlock(&server->req_lock);
+
trace_smb3_credit_timeout(server->CurrentMid,
- server->hostname, num_credits, 0);
+ server->conn_id, server->hostname, scredits,
+ num_credits, in_flight);
cifs_server_dbg(VFS, "wait timed out after %d ms\n",
- timeout);
- return -ENOTSUPP;
+ timeout);
+ return -EBUSY;
}
if (rc == -ERESTARTSYS)
return -ERESTARTSYS;
@@ -595,6 +611,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
server->in_flight > 2 * MAX_COMPOUND &&
*credits <= MAX_COMPOUND) {
spin_unlock(&server->req_lock);
+
cifs_num_waiters_inc(server);
rc = wait_event_killable_timeout(
server->request_q,
@@ -603,13 +620,18 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
t);
cifs_num_waiters_dec(server);
if (!rc) {
+ spin_lock(&server->req_lock);
+ scredits = *credits;
+ in_flight = server->in_flight;
+ spin_unlock(&server->req_lock);
+
trace_smb3_credit_timeout(
- server->CurrentMid,
- server->hostname, num_credits,
- 0);
+ server->CurrentMid,
+ server->conn_id, server->hostname,
+ scredits, num_credits, in_flight);
cifs_server_dbg(VFS, "wait timed out after %d ms\n",
- timeout);
- return -ENOTSUPP;
+ timeout);
+ return -EBUSY;
}
if (rc == -ERESTARTSYS)
return -ERESTARTSYS;
@@ -625,16 +647,18 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
/* update # of requests on the wire to server */
if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) {
*credits -= num_credits;
- scredits = *credits;
server->in_flight += num_credits;
if (server->in_flight > server->max_in_flight)
server->max_in_flight = server->in_flight;
*instance = server->reconnect_instance;
}
+ scredits = *credits;
+ in_flight = server->in_flight;
spin_unlock(&server->req_lock);
trace_smb3_add_credits(server->CurrentMid,
- server->hostname, scredits, -(num_credits));
+ server->conn_id, server->hostname, scredits,
+ -(num_credits), in_flight);
cifs_dbg(FYI, "%s: remove %u credits total=%d\n",
__func__, num_credits, scredits);
break;
@@ -656,13 +680,13 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num,
const int flags, unsigned int *instance)
{
int *credits;
- int scredits, sin_flight;
+ int scredits, in_flight;
credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK);
spin_lock(&server->req_lock);
scredits = *credits;
- sin_flight = server->in_flight;
+ in_flight = server->in_flight;
if (*credits < num) {
/*
@@ -684,10 +708,11 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num,
if (server->in_flight == 0) {
spin_unlock(&server->req_lock);
trace_smb3_insufficient_credits(server->CurrentMid,
- server->hostname, scredits, sin_flight);
+ server->conn_id, server->hostname, scredits,
+ num, in_flight);
cifs_dbg(FYI, "%s: %d requests in flight, needed %d total=%d\n",
- __func__, sin_flight, num, scredits);
- return -ENOTSUPP;
+ __func__, in_flight, num, scredits);
+ return -EDEADLK;
}
}
spin_unlock(&server->req_lock);
@@ -1171,7 +1196,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
/*
* Compounding is never used during session establish.
*/
- if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP))
+ if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP))
smb311_update_preauth_hash(ses, rqst[0].rq_iov,
rqst[0].rq_nvec);
@@ -1236,7 +1261,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
/*
* Compounding is never used during session establish.
*/
- if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) {
+ if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
struct kvec iov = {
.iov_base = resp_iov[0].iov_base,
.iov_len = resp_iov[0].iov_len
diff --git a/fs/coredump.c b/fs/coredump.c
index ae778937a1ff..1c0fdc1aa70b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -897,10 +897,10 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
*/
page = get_dump_page(addr);
if (page) {
- void *kaddr = kmap(page);
+ void *kaddr = kmap_local_page(page);
stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
- kunmap(page);
+ kunmap_local(kaddr);
put_page(page);
} else {
stop = !dump_skip(cprm, PAGE_SIZE);
diff --git a/fs/dcache.c b/fs/dcache.c
index 799d9e4f0bcd..7d24ff7eb206 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2176,8 +2176,8 @@ EXPORT_SYMBOL(d_obtain_root);
* same inode, only the actual correct case is stored in the dcache for
* case-insensitive filesystems.
*
- * For a case-insensitive lookup match and if the the case-exact dentry
- * already exists in in the dcache, use it and return it.
+ * For a case-insensitive lookup match and if the case-exact dentry
+ * already exists in the dcache, use it and return it.
*
* If no entry exists with the exact case name, allocate new dentry with
* the exact case, and return the spliced entry.
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c35249497b9b..22e86ae4dd5a 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -298,7 +298,7 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
{
struct dentry *dentry;
- if (IS_ERR(parent))
+ if (!debugfs_initialized() || IS_ERR_OR_NULL(name) || IS_ERR(parent))
return NULL;
if (!parent)
@@ -319,6 +319,9 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
if (!(debugfs_allow & DEBUGFS_ALLOW_API))
return ERR_PTR(-EPERM);
+ if (!debugfs_initialized())
+ return ERR_PTR(-ENOENT);
+
pr_debug("creating file '%s'\n", name);
if (IS_ERR(parent))
diff --git a/fs/direct-io.c b/fs/direct-io.c
index aa1083ecd623..b61491bf3166 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -462,7 +462,7 @@ static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
* Wait for the next BIO to complete. Remove it and return it. NULL is
* returned once all BIOs have been completed. This must only be called once
* all bios have been issued so that dio->refcount can only decrease. This
- * requires that that the caller hold a reference on the dio.
+ * requires that the caller hold a reference on the dio.
*/
static struct bio *dio_await_one(struct dio *dio)
{
@@ -695,7 +695,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
if (ret)
goto out;
sector = start_sector << (sdio->blkbits - 9);
- nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES);
+ nr_pages = bio_max_segs(sdio->pages_in_io);
BUG_ON(nr_pages <= 0);
dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
sdio->boundary = 0;
@@ -1279,7 +1279,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
if (retval == -ENOTBLK) {
/*
* The remaining part of the request will be
- * be handled by buffered I/O when we return
+ * handled by buffered I/O when we return
*/
retval = 0;
}
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index ea4f693bee22..f88851c5c250 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -215,10 +215,8 @@ submit_bio_retry:
/* max # of continuous pages */
if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE))
nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE);
- if (nblocks > BIO_MAX_PAGES)
- nblocks = BIO_MAX_PAGES;
- bio = bio_alloc(GFP_NOIO, nblocks);
+ bio = bio_alloc(GFP_NOIO, bio_max_segs(nblocks));
bio->bi_end_io = erofs_readendio;
bio_set_dev(bio, sb->s_bdev);
diff --git a/fs/exec.c b/fs/exec.c
index 6f3c02066ce3..18594f11c31f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1455,7 +1455,7 @@ EXPORT_SYMBOL(finalize_exec);
/*
* Prepare credentials and lock ->cred_guard_mutex.
* setup_new_exec() commits the new creds and drops the lock.
- * Or, if exec fails before, free_bprm() should release ->cred and
+ * Or, if exec fails before, free_bprm() should release ->cred
* and unlock.
*/
static int prepare_bprm_creds(struct linux_binprm *bprm)
@@ -1841,7 +1841,7 @@ static int bprm_execve(struct linux_binprm *bprm,
out:
/*
- * If past the point of no return ensure the the code never
+ * If past the point of no return ensure the code never
* returns to the userspace process. Use an existing fatal
* signal if present otherwise terminate the process with
* SIGSEGV.
diff --git a/fs/ext4/.kunitconfig b/fs/ext4/.kunitconfig
new file mode 100644
index 000000000000..bf51da7cd9fc
--- /dev/null
+++ b/fs/ext4/.kunitconfig
@@ -0,0 +1,3 @@
+CONFIG_KUNIT=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_KUNIT_TESTS=y
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 619dd35ddd48..86699c8cab28 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -103,8 +103,7 @@ config EXT4_DEBUG
config EXT4_KUNIT_TESTS
tristate "KUnit tests for ext4" if !KUNIT_ALL_TESTS
- select EXT4_FS
- depends on KUNIT
+ depends on EXT4_FS && KUNIT
default KUNIT_ALL_TESTS
help
This builds the ext4 KUnit tests.
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3960b7ec3ab7..77c7c8a54da7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4382,8 +4382,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
{
struct inode *inode = file_inode(file);
handle_t *handle;
- int ret = 0;
- int ret2 = 0, ret3 = 0;
+ int ret, ret2 = 0, ret3 = 0;
int retries = 0;
int depth = 0;
struct ext4_map_blocks map;
@@ -4408,7 +4407,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
depth = ext_depth(inode);
retry:
- while (ret >= 0 && len) {
+ while (len) {
/*
* Recalculate credits when extent tree depth changes.
*/
@@ -4430,9 +4429,13 @@ retry:
inode->i_ino, map.m_lblk,
map.m_len, ret);
ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
+ ext4_journal_stop(handle);
break;
}
+ /*
+ * allow a full retry cycle for any remaining allocations
+ */
+ retries = 0;
map.m_lblk += ret;
map.m_len = len = len - ret;
epos = (loff_t)map.m_lblk << inode->i_blkbits;
@@ -4450,11 +4453,8 @@ retry:
if (unlikely(ret2))
break;
}
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries)) {
- ret = 0;
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
- }
return ret > 0 ? ret2 : ret;
}
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 6e8208acfc62..6c4f19b0a556 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -915,13 +915,11 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
struct super_block *sb = (struct super_block *)(journal->j_private);
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *ei;
- struct list_head *pos;
int ret = 0;
spin_lock(&sbi->s_fc_lock);
ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
- list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
- ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
+ list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
while (atomic_read(&ei->i_fc_updates)) {
DEFINE_WAIT(wait);
@@ -978,17 +976,15 @@ __releases(&sbi->s_fc_lock)
{
struct super_block *sb = (struct super_block *)(journal->j_private);
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_fc_dentry_update *fc_dentry;
+ struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
struct inode *inode;
- struct list_head *pos, *n, *fcd_pos, *fcd_n;
- struct ext4_inode_info *ei;
+ struct ext4_inode_info *ei, *ei_n;
int ret;
if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
return 0;
- list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
- fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
- fcd_list);
+ list_for_each_entry_safe(fc_dentry, fc_dentry_n,
+ &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
spin_unlock(&sbi->s_fc_lock);
if (!ext4_fc_add_dentry_tlv(
@@ -1004,8 +1000,8 @@ __releases(&sbi->s_fc_lock)
}
inode = NULL;
- list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
- ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
+ list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
+ i_fc_list) {
if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
inode = &ei->vfs_inode;
break;
@@ -1057,7 +1053,6 @@ static int ext4_fc_perform_commit(journal_t *journal)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *iter;
struct ext4_fc_head head;
- struct list_head *pos;
struct inode *inode;
struct blk_plug plug;
int ret = 0;
@@ -1099,8 +1094,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
goto out;
}
- list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
- iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
inode = &iter->vfs_inode;
if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
continue;
@@ -1226,9 +1220,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_inode_info *iter;
+ struct ext4_inode_info *iter, *iter_n;
struct ext4_fc_dentry_update *fc_dentry;
- struct list_head *pos, *n;
if (full && sbi->s_fc_bh)
sbi->s_fc_bh = NULL;
@@ -1236,8 +1229,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
jbd2_fc_release_bufs(journal);
spin_lock(&sbi->s_fc_lock);
- list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
- iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
+ list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
+ i_fc_list) {
list_del_init(&iter->i_fc_list);
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 877c602ae063..686bf982c84e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -731,6 +731,29 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
(space/bcount)*100/blocksize);
return (struct stats) { names, space, bcount};
}
+
+/*
+ * Linear search cross check
+ */
+static inline void htree_rep_invariant_check(struct dx_entry *at,
+ struct dx_entry *target,
+ u32 hash, unsigned int n)
+{
+ while (n--) {
+ dxtrace(printk(KERN_CONT ","));
+ if (dx_get_hash(++at) > hash) {
+ at--;
+ break;
+ }
+ }
+ ASSERT(at == target - 1);
+}
+#else /* DX_DEBUG */
+static inline void htree_rep_invariant_check(struct dx_entry *at,
+ struct dx_entry *target,
+ u32 hash, unsigned int n)
+{
+}
#endif /* DX_DEBUG */
/*
@@ -827,20 +850,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
p = m + 1;
}
- if (0) { // linear search cross check
- unsigned n = count - 1;
- at = entries;
- while (n--)
- {
- dxtrace(printk(KERN_CONT ","));
- if (dx_get_hash(++at) > hash)
- {
- at--;
- break;
- }
- }
- ASSERT(at == p - 1);
- }
+ htree_rep_invariant_check(entries, p, hash, count - 1);
at = p - 1;
dxtrace(printk(KERN_CONT " %x->%u\n",
@@ -2401,11 +2411,10 @@ again:
(frame - 1)->bh);
if (err)
goto journal_error;
- if (restart) {
- err = ext4_handle_dirty_dx_node(handle, dir,
- frame->bh);
+ err = ext4_handle_dirty_dx_node(handle, dir,
+ frame->bh);
+ if (err)
goto journal_error;
- }
} else {
struct dx_root *dxroot;
memcpy((char *) entries2, (char *) entries,
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index f014c5e473a9..3db923403505 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -371,8 +371,7 @@ int ext4_mpage_readpages(struct inode *inode,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_KERNEL,
- min_t(int, nr_pages, BIO_MAX_PAGES));
+ bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages));
fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, page->index);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 802bd26ed01c..ad34a37278cd 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -59,7 +59,7 @@
#include <trace/events/ext4.h>
static struct ext4_lazy_init *ext4_li_info;
-static struct mutex ext4_li_mtx;
+static DEFINE_MUTEX(ext4_li_mtx);
static struct ratelimit_state ext4_mount_msg_ratelimit;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
@@ -4875,7 +4875,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
- sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
sbi->s_journal->j_submit_inode_data_buffers =
ext4_journal_submit_inode_data_buffers;
sbi->s_journal->j_finish_inode_data_buffers =
@@ -4987,6 +4986,14 @@ no_journal:
goto failed_mount5;
}
+ /*
+ * We can only set up the journal commit callback once
+ * mballoc is initialized
+ */
+ if (sbi->s_journal)
+ sbi->s_journal->j_commit_callback =
+ ext4_journal_commit_callback;
+
block = ext4_count_free_clusters(sb);
ext4_free_blocks_count_set(sbi->s_es,
EXT4_C2B(sbi, block));
@@ -6667,7 +6674,6 @@ static int __init ext4_init_fs(void)
ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
ext4_li_info = NULL;
- mutex_init(&ext4_li_mtx);
/* Build-time check for flags consistency */
ext4_check_flag_values();
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b9721c8f116c..7c95818639a6 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -969,8 +969,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
unsigned int post_read_steps = 0;
bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
- min_t(int, nr_pages, BIO_MAX_PAGES),
- &f2fs_bioset);
+ bio_max_segs(nr_pages), &f2fs_bioset);
if (!bio)
return ERR_PTR(-ENOMEM);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index a8a0fb890e8d..4b0e2e3c2c88 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2747,7 +2747,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
sum_entry = &sum->entries[0];
for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
- nrpages = min(last_offset - i, BIO_MAX_PAGES);
+ nrpages = bio_max_segs(last_offset - i);
/* readahead node pages */
f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true);
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 01263ffbc4c0..ec6feeccc276 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -173,7 +173,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
/*
* With handle we don't look at the execute bit on the
- * the directory. Ideally we would like CAP_DAC_SEARCH.
+ * directory. Ideally we would like CAP_DAC_SEARCH.
* But we don't have that
*/
if (!capable(CAP_DAC_READ_SEARCH)) {
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 588f8d1240aa..c6636b4c4ccf 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -844,11 +844,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (WARN_ON(PageMlocked(oldpage)))
goto out_fallback_unlock;
- err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
- if (err) {
- unlock_page(newpage);
- goto out_put_old;
- }
+ replace_page_cache_page(oldpage, newpage);
get_page(newpage);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 68376fa03880..c9775d5c6594 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -491,8 +491,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
di = (struct gfs2_dinode *)dibh->b_data;
gfs2_dinode_out(ip, di);
- di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev));
- di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev));
+ di->di_major = cpu_to_be32(imajor(&ip->i_inode));
+ di->di_minor = cpu_to_be32(iminor(&ip->i_inode));
di->__pad1 = 0;
di->__pad2 = 0;
di->__pad3 = 0;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b7a72f577aab..701c82c36138 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -171,7 +171,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
ret = -ENOMEM;
- if (hugetlb_reserve_pages(inode,
+ if (!hugetlb_reserve_pages(inode,
vma->vm_pgoff >> huge_page_order(h),
len >> huge_page_shift(h), vma,
vma->vm_flags))
@@ -310,7 +310,7 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset,
/*
* Support for read() - Find the page attached to f_mapping and copy out the
- * data. Its *very* similar to do_generic_mapping_read(), we can't use that
+ * data. Its *very* similar to generic_file_buffered_read(), we can't use that
* since it has PAGE_SIZE assumptions.
*/
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -442,15 +442,15 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
*
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
- * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+ * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
* maps and global counts. Page faults can not race with truncation
* in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents
* page faults in the truncated range by checking i_size. i_size is
* modified while holding i_mmap_rwsem.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
- * Only when releasing a page is the associated region/reserv map
- * deleted. The region/reserv map for ranges without associated
+ * Only when releasing a page is the associated region/reserve map
+ * deleted. The region/reserve map for ranges without associated
* pages are not modified. Page faults can race with hole punch.
* This is indicated if we find a mapped page.
* Note: If the passed end of range value is beyond the end of file, but
@@ -567,7 +567,7 @@ static void hugetlbfs_evict_inode(struct inode *inode)
clear_inode(inode);
}
-static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
+static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
pgoff_t pgoff;
struct address_space *mapping = inode->i_mapping;
@@ -582,7 +582,6 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, offset, LLONG_MAX);
- return 0;
}
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
@@ -604,7 +603,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
inode_lock(inode);
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
inode_unlock(inode);
return -EPERM;
@@ -680,7 +679,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
*/
struct page *page;
unsigned long addr;
- int avoid_reserve = 0;
cond_resched();
@@ -716,8 +714,15 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
continue;
}
- /* Allocate page and add to page cache */
- page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+ /*
+ * Allocate page without setting the avoid_reserve argument.
+ * There certainly are no reserves associated with the
+ * pseudo_vma. However, there could be shared mappings with
+ * reserves for the file at the inode level. If we fallocate
+ * pages in these areas, we need to consume the reserves
+ * to keep reservation accounting consistent.
+ */
+ page = alloc_huge_page(&pseudo_vma, addr, 0);
hugetlb_drop_vma_policy(&pseudo_vma);
if (IS_ERR(page)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -735,7 +740,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- set_page_huge_active(page);
+ SetHPageMigratable(page);
/*
* unlock_page because locked by add_to_page_cache()
* put_page() due to reference from alloc_huge_page()
@@ -761,8 +766,6 @@ static int hugetlbfs_setattr(struct user_namespace *mnt_userns,
unsigned int ia_valid = attr->ia_valid;
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
- BUG_ON(!inode);
-
error = setattr_prepare(&init_user_ns, dentry, attr);
if (error)
return error;
@@ -773,13 +776,11 @@ static int hugetlbfs_setattr(struct user_namespace *mnt_userns,
if (newsize & ~huge_page_mask(h))
return -EINVAL;
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
(newsize > oldsize && (info->seals & F_SEAL_GROW)))
return -EPERM;
- error = hugetlb_vmtruncate(inode, newsize);
- if (error)
- return error;
+ hugetlb_vmtruncate(inode, newsize);
}
setattr_copy(&init_user_ns, inode, attr);
@@ -952,17 +953,6 @@ static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
return error;
}
-/*
- * mark the head page dirty
- */
-static int hugetlbfs_set_page_dirty(struct page *page)
-{
- struct page *head = compound_head(page);
-
- SetPageDirty(head);
- return 0;
-}
-
static int hugetlbfs_migrate_page(struct address_space *mapping,
struct page *newpage, struct page *page,
enum migrate_mode mode)
@@ -973,15 +963,9 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- /*
- * page_private is subpool pointer in hugetlb pages. Transfer to
- * new page. PagePrivate is not associated with page_private for
- * hugetlb pages and can not be set here as only page_huge_active
- * pages can be migrated.
- */
- if (page_private(page)) {
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
+ if (hugetlb_page_subpool(page)) {
+ hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page));
+ hugetlb_set_page_subpool(page, NULL);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -1156,7 +1140,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
static const struct address_space_operations hugetlbfs_aops = {
.write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end,
- .set_page_dirty = hugetlbfs_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_no_writeback,
.migratepage = hugetlbfs_migrate_page,
.error_remove_page = hugetlbfs_error_remove_page,
};
@@ -1356,7 +1340,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
/*
* Allocate and initialize subpool if maximum or minimum size is
- * specified. Any needed reservations (for minimim size) are taken
+ * specified. Any needed reservations (for minimum size) are taken
* taken when the subpool is created.
*/
if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
@@ -1499,7 +1483,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
inode->i_size = size;
clear_nlink(inode);
- if (hugetlb_reserve_pages(inode, 0,
+ if (!hugetlb_reserve_pages(inode, 0,
size >> huge_page_shift(hstate_inode(inode)), NULL,
acctflag))
file = ERR_PTR(-ENOMEM);
@@ -1533,8 +1517,8 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
put_fs_context(fc);
}
if (IS_ERR(mnt))
- pr_err("Cannot mount internal hugetlbfs for page size %uK",
- 1U << (h->order + PAGE_SHIFT - 10));
+ pr_err("Cannot mount internal hugetlbfs for page size %luK",
+ huge_page_size(h) >> 10);
return mnt;
}
@@ -1562,7 +1546,7 @@ static int __init init_hugetlbfs_fs(void)
goto out_free;
/* default hstate mount is required */
- mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]);
+ mnt = mount_one_hugetlbfs(&default_hstate);
if (IS_ERR(mnt)) {
error = PTR_ERR(mnt);
goto out_unreg;
diff --git a/fs/inode.c b/fs/inode.c
index 6dba963d3f6d..a047ab306f9a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -142,6 +142,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
inode->i_fop = &no_open_fops;
+ inode->i_ino = 0;
inode->__i_nlink = 1;
inode->i_opflags = 0;
if (sb->s_xattr)
diff --git a/fs/io-wq.c b/fs/io-wq.c
index c36bbcd823ce..28868eb4cd09 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -13,13 +13,10 @@
#include <linux/sched/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
-#include <linux/kthread.h>
#include <linux/rculist_nulls.h>
-#include <linux/fs_struct.h>
-#include <linux/task_work.h>
-#include <linux/blk-cgroup.h>
-#include <linux/audit.h>
#include <linux/cpu.h>
+#include <linux/tracehook.h>
+#include <linux/freezer.h>
#include "../kernel/sched/sched.h"
#include "io-wq.h"
@@ -36,7 +33,6 @@ enum {
enum {
IO_WQ_BIT_EXIT = 0, /* wq exiting */
- IO_WQ_BIT_ERROR = 1, /* error on setup */
};
enum {
@@ -57,14 +53,9 @@ struct io_worker {
struct io_wq_work *cur_work;
spinlock_t lock;
+ struct completion ref_done;
+
struct rcu_head rcu;
- struct mm_struct *mm;
-#ifdef CONFIG_BLK_CGROUP
- struct cgroup_subsys_state *blkcg_css;
-#endif
- const struct cred *cur_creds;
- const struct cred *saved_creds;
- struct nsproxy *restore_nsproxy;
};
#if BITS_PER_LONG == 64
@@ -93,7 +84,6 @@ struct io_wqe {
struct {
raw_spinlock_t lock;
struct io_wq_work_list work_list;
- unsigned long hash_map;
unsigned flags;
} ____cacheline_aligned_in_smp;
@@ -103,6 +93,8 @@ struct io_wqe {
struct hlist_nulls_head free_list;
struct list_head all_list;
+ struct wait_queue_entry wait;
+
struct io_wq *wq;
struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
};
@@ -119,16 +111,33 @@ struct io_wq {
struct task_struct *manager;
struct user_struct *user;
+
+ struct io_wq_hash *hash;
+
refcount_t refs;
- struct completion done;
+ struct completion exited;
+
+ atomic_t worker_refs;
+ struct completion worker_done;
struct hlist_node cpuhp_node;
- refcount_t use_refs;
+ pid_t task_pid;
};
static enum cpuhp_state io_wq_online;
+struct io_cb_cancel_data {
+ work_cancel_fn *fn;
+ void *data;
+ int nr_running;
+ int nr_pending;
+ bool cancel_all;
+};
+
+static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
+ struct io_cb_cancel_data *match);
+
static bool io_worker_get(struct io_worker *worker)
{
return refcount_inc_not_zero(&worker->ref);
@@ -137,62 +146,7 @@ static bool io_worker_get(struct io_worker *worker)
static void io_worker_release(struct io_worker *worker)
{
if (refcount_dec_and_test(&worker->ref))
- wake_up_process(worker->task);
-}
-
-/*
- * Note: drops the wqe->lock if returning true! The caller must re-acquire
- * the lock in that case. Some callers need to restart handling if this
- * happens, so we can't just re-acquire the lock on behalf of the caller.
- */
-static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
-{
- bool dropped_lock = false;
-
- if (worker->saved_creds) {
- revert_creds(worker->saved_creds);
- worker->cur_creds = worker->saved_creds = NULL;
- }
-
- if (current->files) {
- __acquire(&wqe->lock);
- raw_spin_unlock_irq(&wqe->lock);
- dropped_lock = true;
-
- task_lock(current);
- current->files = NULL;
- current->nsproxy = worker->restore_nsproxy;
- task_unlock(current);
- }
-
- if (current->fs)
- current->fs = NULL;
-
- /*
- * If we have an active mm, we need to drop the wq lock before unusing
- * it. If we do, return true and let the caller retry the idle loop.
- */
- if (worker->mm) {
- if (!dropped_lock) {
- __acquire(&wqe->lock);
- raw_spin_unlock_irq(&wqe->lock);
- dropped_lock = true;
- }
- __set_current_state(TASK_RUNNING);
- kthread_unuse_mm(worker->mm);
- mmput(worker->mm);
- worker->mm = NULL;
- }
-
-#ifdef CONFIG_BLK_CGROUP
- if (worker->blkcg_css) {
- kthread_associate_blkcg(NULL);
- worker->blkcg_css = NULL;
- }
-#endif
- if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- return dropped_lock;
+ complete(&worker->ref_done);
}
static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
@@ -204,9 +158,10 @@ static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
return &wqe->acct[IO_WQ_ACCT_BOUND];
}
-static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe,
- struct io_worker *worker)
+static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker)
{
+ struct io_wqe *wqe = worker->wqe;
+
if (worker->flags & IO_WORKER_F_BOUND)
return &wqe->acct[IO_WQ_ACCT_BOUND];
@@ -216,39 +171,33 @@ static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe,
static void io_worker_exit(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
- struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+ unsigned flags;
- /*
- * If we're not at zero, someone else is holding a brief reference
- * to the worker. Wait for that to go away.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- if (!refcount_dec_and_test(&worker->ref))
- schedule();
- __set_current_state(TASK_RUNNING);
+ if (refcount_dec_and_test(&worker->ref))
+ complete(&worker->ref_done);
+ wait_for_completion(&worker->ref_done);
preempt_disable();
current->flags &= ~PF_IO_WORKER;
- if (worker->flags & IO_WORKER_F_RUNNING)
+ flags = worker->flags;
+ worker->flags = 0;
+ if (flags & IO_WORKER_F_RUNNING)
atomic_dec(&acct->nr_running);
- if (!(worker->flags & IO_WORKER_F_BOUND))
- atomic_dec(&wqe->wq->user->processes);
worker->flags = 0;
preempt_enable();
raw_spin_lock_irq(&wqe->lock);
- hlist_nulls_del_rcu(&worker->nulls_node);
+ if (flags & IO_WORKER_F_FREE)
+ hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
- if (__io_worker_unuse(wqe, worker)) {
- __release(&wqe->lock);
- raw_spin_lock_irq(&wqe->lock);
- }
acct->nr_workers--;
raw_spin_unlock_irq(&wqe->lock);
kfree_rcu(worker, rcu);
- if (refcount_dec_and_test(&wqe->wq->refs))
- complete(&wqe->wq->done);
+ if (atomic_dec_and_test(&wqe->wq->worker_refs))
+ complete(&wqe->wq->worker_done);
+ do_exit(0);
}
static inline bool io_wqe_run_queue(struct io_wqe *wqe)
@@ -306,35 +255,23 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
wake_up_process(wqe->wq->manager);
}
-static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker)
+static void io_wqe_inc_running(struct io_worker *worker)
{
- struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
atomic_inc(&acct->nr_running);
}
-static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker)
+static void io_wqe_dec_running(struct io_worker *worker)
__must_hold(wqe->lock)
{
- struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+ struct io_wqe *wqe = worker->wqe;
if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe))
io_wqe_wake_worker(wqe, acct);
}
-static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
-{
- allow_kernel_signal(SIGINT);
-
- current->flags |= PF_IO_WORKER;
- current->fs = NULL;
- current->files = NULL;
-
- worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
- worker->restore_nsproxy = current->nsproxy;
- io_wqe_inc_running(wqe, worker);
-}
-
/*
* Worker will start processing some work. Move it to the busy list, if
* it's currently on the freelist
@@ -357,19 +294,17 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
if (worker_bound != work_bound) {
- io_wqe_dec_running(wqe, worker);
+ io_wqe_dec_running(worker);
if (work_bound) {
worker->flags |= IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
- atomic_dec(&wqe->wq->user->processes);
} else {
worker->flags &= ~IO_WORKER_F_BOUND;
wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
- atomic_inc(&wqe->wq->user->processes);
}
- io_wqe_inc_running(wqe, worker);
+ io_wqe_inc_running(worker);
}
}
@@ -380,15 +315,13 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
* retry the loop in that case (we changed task state), we don't regrab
* the lock if we return success.
*/
-static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
+static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
__must_hold(wqe->lock)
{
if (!(worker->flags & IO_WORKER_F_FREE)) {
worker->flags |= IO_WORKER_F_FREE;
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
}
-
- return __io_worker_unuse(wqe, worker);
}
static inline unsigned int io_get_work_hash(struct io_wq_work *work)
@@ -396,14 +329,31 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work)
return work->flags >> IO_WQ_HASH_SHIFT;
}
+static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
+{
+ struct io_wq *wq = wqe->wq;
+
+ spin_lock(&wq->hash->wait.lock);
+ if (list_empty(&wqe->wait.entry)) {
+ __add_wait_queue(&wq->hash->wait, &wqe->wait);
+ if (!test_bit(hash, &wq->hash->map)) {
+ __set_current_state(TASK_RUNNING);
+ list_del_init(&wqe->wait.entry);
+ }
+ }
+ spin_unlock(&wq->hash->wait.lock);
+}
+
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
__must_hold(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work, *tail;
- unsigned int hash;
+ unsigned int stall_hash = -1U;
wq_list_for_each(node, prev, &wqe->work_list) {
+ unsigned int hash;
+
work = container_of(node, struct io_wq_work, list);
/* not hashed, can run anytime */
@@ -412,111 +362,48 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
return work;
}
- /* hashed, can run if not already running */
hash = io_get_work_hash(work);
- if (!(wqe->hash_map & BIT(hash))) {
- wqe->hash_map |= BIT(hash);
- /* all items with this hash lie in [work, tail] */
- tail = wqe->hash_tail[hash];
+ /* all items with this hash lie in [work, tail] */
+ tail = wqe->hash_tail[hash];
+
+ /* hashed, can run if not already running */
+ if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
wqe->hash_tail[hash] = NULL;
wq_list_cut(&wqe->work_list, &tail->list, prev);
return work;
}
+ if (stall_hash == -1U)
+ stall_hash = hash;
+ /* fast forward to a next hash, for-each will fix up @prev */
+ node = &tail->list;
}
- return NULL;
-}
-
-static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
-{
- if (worker->mm) {
- kthread_unuse_mm(worker->mm);
- mmput(worker->mm);
- worker->mm = NULL;
- }
-
- if (mmget_not_zero(work->identity->mm)) {
- kthread_use_mm(work->identity->mm);
- worker->mm = work->identity->mm;
- return;
- }
-
- /* failed grabbing mm, ensure work gets cancelled */
- work->flags |= IO_WQ_WORK_CANCEL;
-}
-
-static inline void io_wq_switch_blkcg(struct io_worker *worker,
- struct io_wq_work *work)
-{
-#ifdef CONFIG_BLK_CGROUP
- if (!(work->flags & IO_WQ_WORK_BLKCG))
- return;
- if (work->identity->blkcg_css != worker->blkcg_css) {
- kthread_associate_blkcg(work->identity->blkcg_css);
- worker->blkcg_css = work->identity->blkcg_css;
+ if (stall_hash != -1U) {
+ raw_spin_unlock(&wqe->lock);
+ io_wait_on_hash(wqe, stall_hash);
+ raw_spin_lock(&wqe->lock);
}
-#endif
-}
-
-static void io_wq_switch_creds(struct io_worker *worker,
- struct io_wq_work *work)
-{
- const struct cred *old_creds = override_creds(work->identity->creds);
- worker->cur_creds = work->identity->creds;
- if (worker->saved_creds)
- put_cred(old_creds); /* creds set by previous switch */
- else
- worker->saved_creds = old_creds;
+ return NULL;
}
-static void io_impersonate_work(struct io_worker *worker,
- struct io_wq_work *work)
+static void io_flush_signals(void)
{
- if ((work->flags & IO_WQ_WORK_FILES) &&
- current->files != work->identity->files) {
- task_lock(current);
- current->files = work->identity->files;
- current->nsproxy = work->identity->nsproxy;
- task_unlock(current);
- if (!work->identity->files) {
- /* failed grabbing files, ensure work gets cancelled */
- work->flags |= IO_WQ_WORK_CANCEL;
- }
+ if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) {
+ if (current->task_works)
+ task_work_run();
+ clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL);
}
- if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
- current->fs = work->identity->fs;
- if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm)
- io_wq_switch_mm(worker, work);
- if ((work->flags & IO_WQ_WORK_CREDS) &&
- worker->cur_creds != work->identity->creds)
- io_wq_switch_creds(worker, work);
- if (work->flags & IO_WQ_WORK_FSIZE)
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize;
- else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- io_wq_switch_blkcg(worker, work);
-#ifdef CONFIG_AUDIT
- current->loginuid = work->identity->loginuid;
- current->sessionid = work->identity->sessionid;
-#endif
}
static void io_assign_current_work(struct io_worker *worker,
struct io_wq_work *work)
{
if (work) {
- /* flush pending signals before assigning new work */
- if (signal_pending(current))
- flush_signals(current);
+ io_flush_signals();
cond_resched();
}
-#ifdef CONFIG_AUDIT
- current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET);
- current->sessionid = AUDIT_SID_UNSET;
-#endif
-
spin_lock_irq(&worker->lock);
worker->cur_work = work;
spin_unlock_irq(&worker->lock);
@@ -550,6 +437,7 @@ get_next:
if (!work)
break;
io_assign_current_work(worker, work);
+ __set_current_state(TASK_RUNNING);
/* handle a whole dependent link */
do {
@@ -557,7 +445,6 @@ get_next:
unsigned int hash = io_get_work_hash(work);
next_hashed = wq_next_work(work);
- io_impersonate_work(worker, work);
wq->do_work(work);
io_assign_current_work(worker, NULL);
@@ -572,8 +459,10 @@ get_next:
io_wqe_enqueue(wqe, linked);
if (hash != -1U && !next_hashed) {
+ clear_bit(hash, &wq->hash->map);
+ if (wq_has_sleeper(&wq->hash->wait))
+ wake_up(&wq->hash->wait);
raw_spin_lock_irq(&wqe->lock);
- wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
@@ -591,28 +480,29 @@ static int io_wqe_worker(void *data)
struct io_worker *worker = data;
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
+ char buf[TASK_COMM_LEN];
+
+ worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
+ io_wqe_inc_running(worker);
- io_worker_start(wqe, worker);
+ sprintf(buf, "iou-wrk-%d", wq->task_pid);
+ set_task_comm(current, buf);
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
set_current_state(TASK_INTERRUPTIBLE);
loop:
raw_spin_lock_irq(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
- __set_current_state(TASK_RUNNING);
io_worker_handle_work(worker);
goto loop;
}
- /* drops the lock on success, retry */
- if (__io_worker_idle(wqe, worker)) {
- __release(&wqe->lock);
- goto loop;
- }
+ __io_worker_idle(wqe, worker);
raw_spin_unlock_irq(&wqe->lock);
- if (signal_pending(current))
- flush_signals(current);
+ io_flush_signals();
if (schedule_timeout(WORKER_IDLE_TIMEOUT))
continue;
+ if (fatal_signal_pending(current))
+ break;
/* timed out, exit unless we're the fixed worker */
if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
!(worker->flags & IO_WORKER_F_FIXED))
@@ -636,15 +526,16 @@ loop:
*/
void io_wq_worker_running(struct task_struct *tsk)
{
- struct io_worker *worker = kthread_data(tsk);
- struct io_wqe *wqe = worker->wqe;
+ struct io_worker *worker = tsk->pf_io_worker;
+ if (!worker)
+ return;
if (!(worker->flags & IO_WORKER_F_UP))
return;
if (worker->flags & IO_WORKER_F_RUNNING)
return;
worker->flags |= IO_WORKER_F_RUNNING;
- io_wqe_inc_running(wqe, worker);
+ io_wqe_inc_running(worker);
}
/*
@@ -654,9 +545,10 @@ void io_wq_worker_running(struct task_struct *tsk)
*/
void io_wq_worker_sleeping(struct task_struct *tsk)
{
- struct io_worker *worker = kthread_data(tsk);
- struct io_wqe *wqe = worker->wqe;
+ struct io_worker *worker = tsk->pf_io_worker;
+ if (!worker)
+ return;
if (!(worker->flags & IO_WORKER_F_UP))
return;
if (!(worker->flags & IO_WORKER_F_RUNNING))
@@ -664,15 +556,18 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
worker->flags &= ~IO_WORKER_F_RUNNING;
- raw_spin_lock_irq(&wqe->lock);
- io_wqe_dec_running(wqe, worker);
- raw_spin_unlock_irq(&wqe->lock);
+ raw_spin_lock_irq(&worker->wqe->lock);
+ io_wqe_dec_running(worker);
+ raw_spin_unlock_irq(&worker->wqe->lock);
}
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
{
struct io_wqe_acct *acct = &wqe->acct[index];
struct io_worker *worker;
+ struct task_struct *tsk;
+
+ __set_current_state(TASK_RUNNING);
worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
if (!worker)
@@ -682,14 +577,22 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
worker->nulls_node.pprev = NULL;
worker->wqe = wqe;
spin_lock_init(&worker->lock);
+ init_completion(&worker->ref_done);
+
+ atomic_inc(&wq->worker_refs);
- worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
- "io_wqe_worker-%d/%d", index, wqe->node);
- if (IS_ERR(worker->task)) {
+ tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
+ if (IS_ERR(tsk)) {
+ if (atomic_dec_and_test(&wq->worker_refs))
+ complete(&wq->worker_done);
kfree(worker);
return false;
}
- kthread_bind_mask(worker->task, cpumask_of_node(wqe->node));
+
+ tsk->pf_io_worker = worker;
+ worker->task = tsk;
+ set_cpus_allowed_ptr(tsk, cpumask_of_node(wqe->node));
+ tsk->flags |= PF_NOFREEZE | PF_NO_SETAFFINITY;
raw_spin_lock_irq(&wqe->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
@@ -701,12 +604,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
worker->flags |= IO_WORKER_F_FIXED;
acct->nr_workers++;
raw_spin_unlock_irq(&wqe->lock);
-
- if (index == IO_WQ_ACCT_UNBOUND)
- atomic_inc(&wq->user->processes);
-
- refcount_inc(&wq->refs);
- wake_up_process(worker->task);
+ wake_up_new_task(tsk);
return true;
}
@@ -715,6 +613,8 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
{
struct io_wqe_acct *acct = &wqe->acct[index];
+ if (acct->nr_workers && test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state))
+ return false;
/* if we have available workers or no work, no need */
if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
return false;
@@ -748,97 +648,92 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
+ set_notify_signal(worker->task);
wake_up_process(worker->task);
return false;
}
-/*
- * Manager thread. Tasked with creating new workers, if we need them.
- */
-static int io_wq_manager(void *data)
+static void io_wq_check_workers(struct io_wq *wq)
{
- struct io_wq *wq = data;
int node;
- /* create fixed workers */
- refcount_set(&wq->refs, 1);
for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
+ bool fork_worker[2] = { false, false };
+
if (!node_online(node))
continue;
- if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
- continue;
- set_bit(IO_WQ_BIT_ERROR, &wq->state);
- set_bit(IO_WQ_BIT_EXIT, &wq->state);
- goto out;
+
+ raw_spin_lock_irq(&wqe->lock);
+ if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
+ fork_worker[IO_WQ_ACCT_BOUND] = true;
+ if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
+ fork_worker[IO_WQ_ACCT_UNBOUND] = true;
+ raw_spin_unlock_irq(&wqe->lock);
+ if (fork_worker[IO_WQ_ACCT_BOUND])
+ create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
+ if (fork_worker[IO_WQ_ACCT_UNBOUND])
+ create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
}
+}
- complete(&wq->done);
+static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
+{
+ return true;
+}
- while (!kthread_should_stop()) {
- if (current->task_works)
- task_work_run();
+static void io_wq_cancel_pending(struct io_wq *wq)
+{
+ struct io_cb_cancel_data match = {
+ .fn = io_wq_work_match_all,
+ .cancel_all = true,
+ };
+ int node;
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
- bool fork_worker[2] = { false, false };
+ for_each_node(node)
+ io_wqe_cancel_pending_work(wq->wqes[node], &match);
+}
+
+/*
+ * Manager thread. Tasked with creating new workers, if we need them.
+ */
+static int io_wq_manager(void *data)
+{
+ struct io_wq *wq = data;
+ char buf[TASK_COMM_LEN];
+ int node;
- if (!node_online(node))
- continue;
+ sprintf(buf, "iou-mgr-%d", wq->task_pid);
+ set_task_comm(current, buf);
- raw_spin_lock_irq(&wqe->lock);
- if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
- fork_worker[IO_WQ_ACCT_BOUND] = true;
- if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
- fork_worker[IO_WQ_ACCT_UNBOUND] = true;
- raw_spin_unlock_irq(&wqe->lock);
- if (fork_worker[IO_WQ_ACCT_BOUND])
- create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
- if (fork_worker[IO_WQ_ACCT_UNBOUND])
- create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
- }
+ do {
set_current_state(TASK_INTERRUPTIBLE);
+ io_wq_check_workers(wq);
schedule_timeout(HZ);
- }
-
- if (current->task_works)
- task_work_run();
+ try_to_freeze();
+ if (fatal_signal_pending(current))
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state));
-out:
- if (refcount_dec_and_test(&wq->refs)) {
- complete(&wq->done);
- return 0;
- }
- /* if ERROR is set and we get here, we have workers to wake */
- if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
- rcu_read_lock();
- for_each_node(node)
- io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
- rcu_read_unlock();
- }
- return 0;
-}
-
-static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
- struct io_wq_work *work)
-{
- bool free_worker;
-
- if (!(work->flags & IO_WQ_WORK_UNBOUND))
- return true;
- if (atomic_read(&acct->nr_running))
- return true;
+ io_wq_check_workers(wq);
rcu_read_lock();
- free_worker = !hlist_nulls_empty(&wqe->free_list);
+ for_each_node(node)
+ io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
rcu_read_unlock();
- if (free_worker)
- return true;
- if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers &&
- !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
- return false;
+ /* we might not ever have created any workers */
+ if (atomic_read(&wq->worker_refs))
+ wait_for_completion(&wq->worker_done);
- return true;
+ spin_lock_irq(&wq->hash->wait.lock);
+ for_each_node(node)
+ list_del_init(&wq->wqes[node]->wait.entry);
+ spin_unlock_irq(&wq->hash->wait.lock);
+
+ io_wq_cancel_pending(wq);
+ complete(&wq->exited);
+ do_exit(0);
}
static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
@@ -872,20 +767,35 @@ append:
wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
}
+static int io_wq_fork_manager(struct io_wq *wq)
+{
+ struct task_struct *tsk;
+
+ if (wq->manager)
+ return 0;
+
+ reinit_completion(&wq->worker_done);
+ tsk = create_io_thread(io_wq_manager, wq, NUMA_NO_NODE);
+ if (!IS_ERR(tsk)) {
+ wq->manager = get_task_struct(tsk);
+ wake_up_new_task(tsk);
+ return 0;
+ }
+
+ return PTR_ERR(tsk);
+}
+
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
int work_flags;
unsigned long flags;
- /*
- * Do early check to see if we need a new unbound worker, and if we do,
- * if we're allowed to do so. This isn't 100% accurate as there's a
- * gap between this check and incrementing the value, but that's OK.
- * It's close enough to not be an issue, fork() has the same delay.
- */
- if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
- io_run_cancel(work, wqe);
+ /* Can only happen if manager creation fails after exec */
+ if (io_wq_fork_manager(wqe->wq) ||
+ test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) {
+ work->flags |= IO_WQ_WORK_CANCEL;
+ wqe->wq->do_work(work);
return;
}
@@ -919,14 +829,6 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
}
-struct io_cb_cancel_data {
- work_cancel_fn *fn;
- void *data;
- int nr_running;
- int nr_pending;
- bool cancel_all;
-};
-
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
struct io_cb_cancel_data *match = data;
@@ -939,7 +841,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
match->fn(worker->cur_work, match->data)) {
- send_sig(SIGINT, worker->task, 1);
+ set_notify_signal(worker->task);
match->nr_running++;
}
spin_unlock_irqrestore(&worker->lock, flags);
@@ -1043,6 +945,24 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
return IO_WQ_CANCEL_NOTFOUND;
}
+static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
+ int ret;
+
+ list_del_init(&wait->entry);
+
+ rcu_read_lock();
+ ret = io_wqe_activate_free_worker(wqe);
+ rcu_read_unlock();
+
+ if (!ret)
+ wake_up_process(wqe->wq->manager);
+
+ return 1;
+}
+
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
int ret = -ENOMEM, node;
@@ -1063,12 +983,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
if (ret)
goto err_wqes;
+ refcount_inc(&data->hash->refs);
+ wq->hash = data->hash;
wq->free_work = data->free_work;
wq->do_work = data->do_work;
- /* caller must already hold a reference to this */
- wq->user = data->user;
-
ret = -ENOMEM;
for_each_node(node) {
struct io_wqe *wqe;
@@ -1083,11 +1002,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
wqe->node = alloc_node;
wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
- if (wq->user) {
- wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
+ wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
task_rlimit(current, RLIMIT_NPROC);
- }
atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
+ wqe->wait.func = io_wqe_hash_wake;
+ INIT_LIST_HEAD(&wqe->wait.entry);
wqe->wq = wq;
raw_spin_lock_init(&wqe->lock);
INIT_WQ_LIST(&wqe->work_list);
@@ -1095,24 +1014,19 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
INIT_LIST_HEAD(&wqe->all_list);
}
- init_completion(&wq->done);
+ wq->task_pid = current->pid;
+ init_completion(&wq->exited);
+ refcount_set(&wq->refs, 1);
- wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
- if (!IS_ERR(wq->manager)) {
- wake_up_process(wq->manager);
- wait_for_completion(&wq->done);
- if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
- ret = -ENOMEM;
- goto err;
- }
- refcount_set(&wq->use_refs, 1);
- reinit_completion(&wq->done);
+ init_completion(&wq->worker_done);
+ atomic_set(&wq->worker_refs, 0);
+
+ ret = io_wq_fork_manager(wq);
+ if (!ret)
return wq;
- }
- ret = PTR_ERR(wq->manager);
- complete(&wq->done);
err:
+ io_wq_put_hash(data->hash);
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
for_each_node(node)
kfree(wq->wqes[node]);
@@ -1123,46 +1037,46 @@ err_wq:
return ERR_PTR(ret);
}
-bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
+static void io_wq_destroy_manager(struct io_wq *wq)
{
- if (data->free_work != wq->free_work || data->do_work != wq->do_work)
- return false;
-
- return refcount_inc_not_zero(&wq->use_refs);
+ if (wq->manager) {
+ wake_up_process(wq->manager);
+ wait_for_completion(&wq->exited);
+ put_task_struct(wq->manager);
+ wq->manager = NULL;
+ }
}
-static void __io_wq_destroy(struct io_wq *wq)
+static void io_wq_destroy(struct io_wq *wq)
{
int node;
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
set_bit(IO_WQ_BIT_EXIT, &wq->state);
- if (wq->manager)
- kthread_stop(wq->manager);
-
- rcu_read_lock();
- for_each_node(node)
- io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
- rcu_read_unlock();
+ io_wq_destroy_manager(wq);
- wait_for_completion(&wq->done);
-
- for_each_node(node)
- kfree(wq->wqes[node]);
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
+ WARN_ON_ONCE(!wq_list_empty(&wqe->work_list));
+ kfree(wqe);
+ }
+ io_wq_put_hash(wq->hash);
kfree(wq->wqes);
kfree(wq);
}
-void io_wq_destroy(struct io_wq *wq)
+void io_wq_put(struct io_wq *wq)
{
- if (refcount_dec_and_test(&wq->use_refs))
- __io_wq_destroy(wq);
+ if (refcount_dec_and_test(&wq->refs))
+ io_wq_destroy(wq);
}
-struct task_struct *io_wq_get_task(struct io_wq *wq)
+void io_wq_put_and_exit(struct io_wq *wq)
{
- return wq->manager;
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ io_wq_destroy_manager(wq);
+ io_wq_put(wq);
}
static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 096f1021018e..5fbf7997149e 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -1,6 +1,7 @@
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H
+#include <linux/refcount.h>
#include <linux/io_uring.h>
struct io_wq;
@@ -11,13 +12,6 @@ enum {
IO_WQ_WORK_UNBOUND = 4,
IO_WQ_WORK_CONCURRENT = 16,
- IO_WQ_WORK_FILES = 32,
- IO_WQ_WORK_FS = 64,
- IO_WQ_WORK_MM = 128,
- IO_WQ_WORK_CREDS = 256,
- IO_WQ_WORK_BLKCG = 512,
- IO_WQ_WORK_FSIZE = 1024,
-
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
@@ -85,8 +79,8 @@ static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work {
struct io_wq_work_node list;
- struct io_identity *identity;
unsigned flags;
+ unsigned short personality;
};
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
@@ -100,16 +94,27 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef void (io_wq_work_fn)(struct io_wq_work *);
-struct io_wq_data {
- struct user_struct *user;
+struct io_wq_hash {
+ refcount_t refs;
+ unsigned long map;
+ struct wait_queue_head wait;
+};
+
+static inline void io_wq_put_hash(struct io_wq_hash *hash)
+{
+ if (refcount_dec_and_test(&hash->refs))
+ kfree(hash);
+}
+struct io_wq_data {
+ struct io_wq_hash *hash;
io_wq_work_fn *do_work;
free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
-bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
-void io_wq_destroy(struct io_wq *wq);
+void io_wq_put(struct io_wq *wq);
+void io_wq_put_and_exit(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);
@@ -124,8 +129,6 @@ typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data, bool cancel_all);
-struct task_struct *io_wq_get_task(struct io_wq *wq);
-
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
@@ -140,6 +143,7 @@ static inline void io_wq_worker_running(struct task_struct *tsk)
static inline bool io_wq_current_is_worker(void)
{
- return in_task() && (current->flags & PF_IO_WORKER);
+ return in_task() && (current->flags & PF_IO_WORKER) &&
+ current->pf_io_worker;
}
#endif
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 14ce789927e4..92c25b5f1349 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -57,7 +57,6 @@
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
-#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
@@ -75,13 +74,11 @@
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
-#include <linux/fs_struct.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
-#include <linux/blk-cgroup.h>
-#include <linux/audit.h>
+#include <linux/freezer.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -104,6 +101,10 @@
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
+#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+ IOSQE_BUFFER_SELECT)
+
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
u32 tail ____cacheline_aligned_in_smp;
@@ -232,6 +233,7 @@ struct fixed_rsrc_data {
struct fixed_rsrc_ref_node *node;
struct percpu_ref refs;
struct completion done;
+ bool quiesce;
};
struct io_buffer {
@@ -249,6 +251,11 @@ struct io_restriction {
bool registered;
};
+enum {
+ IO_SQ_THREAD_SHOULD_STOP = 0,
+ IO_SQ_THREAD_SHOULD_PARK,
+};
+
struct io_sq_data {
refcount_t refs;
struct mutex lock;
@@ -262,6 +269,13 @@ struct io_sq_data {
struct wait_queue_head wait;
unsigned sq_thread_idle;
+ int sq_cpu;
+ pid_t task_pid;
+
+ unsigned long state;
+ struct completion startup;
+ struct completion parked;
+ struct completion exited;
};
#define IO_IOPOLL_BATCH 8
@@ -279,8 +293,14 @@ struct io_comp_state {
struct list_head locked_free_list;
};
+struct io_submit_link {
+ struct io_kiocb *head;
+ struct io_kiocb *last;
+};
+
struct io_submit_state {
struct blk_plug plug;
+ struct io_submit_link link;
/*
* io_kiocb alloc cache
@@ -312,12 +332,11 @@ struct io_ring_ctx {
struct {
unsigned int flags;
unsigned int compat: 1;
- unsigned int limit_mem: 1;
unsigned int cq_overflow_flushed: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
unsigned int restricted: 1;
- unsigned int sqo_dead: 1;
+ unsigned int sqo_exec: 1;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -339,6 +358,9 @@ struct io_ring_ctx {
unsigned cached_cq_overflow;
unsigned long sq_check_overflow;
+ /* hashed buffered write serialization */
+ struct io_wq_hash *hash_map;
+
struct list_head defer_list;
struct list_head timeout_list;
struct list_head cq_overflow_list;
@@ -355,22 +377,9 @@ struct io_ring_ctx {
struct io_rings *rings;
- /* IO offload */
- struct io_wq *io_wq;
-
- /*
- * For SQPOLL usage - we hold a reference to the parent task, so we
- * have access to the ->files
- */
- struct task_struct *sqo_task;
-
/* Only used for accounting purposes */
struct mm_struct *mm_account;
-#ifdef CONFIG_BLK_CGROUP
- struct cgroup_subsys_state *sqo_blkcg_css;
-#endif
-
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
@@ -390,13 +399,6 @@ struct io_ring_ctx {
struct user_struct *user;
- const struct cred *creds;
-
-#ifdef CONFIG_AUDIT
- kuid_t loginuid;
- unsigned int sessionid;
-#endif
-
struct completion ref_comp;
struct completion sq_thread_comp;
@@ -445,6 +447,11 @@ struct io_ring_ctx {
struct io_restriction restrictions;
+ /* exit task_work */
+ struct callback_head *exit_task_work;
+
+ struct wait_queue_head hash_wait;
+
/* Keep this last, we don't need it for the fast path */
struct work_struct exit_work;
};
@@ -673,7 +680,6 @@ enum {
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
- REQ_F_WORK_INITIALIZED_BIT,
REQ_F_LTIMEOUT_ACTIVE_BIT,
REQ_F_COMPLETE_INLINE_BIT,
@@ -697,7 +703,7 @@ enum {
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
- /* on inflight list */
+ /* on inflight list, should be cancelled and waited on exit reliably */
REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
/* read/write uses file position */
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
@@ -715,8 +721,6 @@ enum {
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
/* doesn't need file table for this request */
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
- /* io_wq_work is initialized */
- REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
/* linked timeout is active, i.e. prepared by link's head */
REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
/* completion is deferred through io_comp_state */
@@ -827,7 +831,6 @@ struct io_op_def {
unsigned plug : 1;
/* size of async data needed, if any */
unsigned short async_size;
- unsigned work_flags;
};
static const struct io_op_def io_op_defs[] = {
@@ -840,7 +843,6 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_data = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_WRITEV] = {
.needs_file = 1,
@@ -850,12 +852,9 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_data = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FSIZE,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
@@ -863,7 +862,6 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
@@ -872,8 +870,6 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
- IO_WQ_WORK_MM,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
@@ -882,7 +878,6 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_SENDMSG] = {
.needs_file = 1,
@@ -890,8 +885,6 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
},
[IORING_OP_RECVMSG] = {
.needs_file = 1,
@@ -900,29 +893,23 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
},
[IORING_OP_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
- .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_TIMEOUT_REMOVE] = {
/* used by timeout updates' prep() */
- .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES,
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
- .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_CONNECT] = {
.needs_file = 1,
@@ -930,26 +917,14 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_connect),
- .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE,
- },
- [IORING_OP_OPENAT] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS | IO_WQ_WORK_MM,
- },
- [IORING_OP_CLOSE] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
- },
- [IORING_OP_FILES_UPDATE] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM,
- },
- [IORING_OP_STATX] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM |
- IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
},
+ [IORING_OP_OPENAT] = {},
+ [IORING_OP_CLOSE] = {},
+ [IORING_OP_FILES_UPDATE] = {},
+ [IORING_OP_STATX] = {},
[IORING_OP_READ] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -957,7 +932,6 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_WRITE] = {
.needs_file = 1,
@@ -965,42 +939,31 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FSIZE,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG,
- },
- [IORING_OP_MADVISE] = {
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
+ [IORING_OP_MADVISE] = {},
[IORING_OP_SEND] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_RECV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_OPENAT2] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
- IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
- .work_flags = IO_WQ_WORK_FILES,
},
[IORING_OP_SPLICE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_PROVIDE_BUFFERS] = {},
[IORING_OP_REMOVE_BUFFERS] = {},
@@ -1012,24 +975,18 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_SHUTDOWN] = {
.needs_file = 1,
},
- [IORING_OP_RENAMEAT] = {
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
- IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
- },
- [IORING_OP_UNLINKAT] = {
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
- IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
- },
+ [IORING_OP_RENAMEAT] = {},
+ [IORING_OP_UNLINKAT] = {},
};
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
struct files_struct *files);
+static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
struct io_ring_ctx *ctx);
-static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
- struct fixed_rsrc_ref_node *ref_node);
+static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static bool io_rw_reissue(struct io_kiocb *req);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
@@ -1112,190 +1069,20 @@ static bool io_match_task(struct io_kiocb *head,
return true;
io_for_each_link(req, head) {
- if (!(req->flags & REQ_F_WORK_INITIALIZED))
- continue;
- if (req->file && req->file->f_op == &io_uring_fops)
+ if (req->flags & REQ_F_INFLIGHT)
return true;
- if ((req->work.flags & IO_WQ_WORK_FILES) &&
- req->work.identity->files == files)
+ if (req->task->files == files)
return true;
}
return false;
}
-static void io_sq_thread_drop_mm_files(void)
-{
- struct files_struct *files = current->files;
- struct mm_struct *mm = current->mm;
-
- if (mm) {
- kthread_unuse_mm(mm);
- mmput(mm);
- current->mm = NULL;
- }
- if (files) {
- struct nsproxy *nsproxy = current->nsproxy;
-
- task_lock(current);
- current->files = NULL;
- current->nsproxy = NULL;
- task_unlock(current);
- put_files_struct(files);
- put_nsproxy(nsproxy);
- }
-}
-
-static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
-{
- if (!current->files) {
- struct files_struct *files;
- struct nsproxy *nsproxy;
-
- task_lock(ctx->sqo_task);
- files = ctx->sqo_task->files;
- if (!files) {
- task_unlock(ctx->sqo_task);
- return -EOWNERDEAD;
- }
- atomic_inc(&files->count);
- get_nsproxy(ctx->sqo_task->nsproxy);
- nsproxy = ctx->sqo_task->nsproxy;
- task_unlock(ctx->sqo_task);
-
- task_lock(current);
- current->files = files;
- current->nsproxy = nsproxy;
- task_unlock(current);
- }
- return 0;
-}
-
-static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
-{
- struct mm_struct *mm;
-
- if (current->mm)
- return 0;
-
- task_lock(ctx->sqo_task);
- mm = ctx->sqo_task->mm;
- if (unlikely(!mm || !mmget_not_zero(mm)))
- mm = NULL;
- task_unlock(ctx->sqo_task);
-
- if (mm) {
- kthread_use_mm(mm);
- return 0;
- }
-
- return -EFAULT;
-}
-
-static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- const struct io_op_def *def = &io_op_defs[req->opcode];
- int ret;
-
- if (def->work_flags & IO_WQ_WORK_MM) {
- ret = __io_sq_thread_acquire_mm(ctx);
- if (unlikely(ret))
- return ret;
- }
-
- if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
- ret = __io_sq_thread_acquire_files(ctx);
- if (unlikely(ret))
- return ret;
- }
-
- return 0;
-}
-
-static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- if (!(ctx->flags & IORING_SETUP_SQPOLL))
- return 0;
- return __io_sq_thread_acquire_mm_files(ctx, req);
-}
-
-static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
- struct cgroup_subsys_state **cur_css)
-
-{
-#ifdef CONFIG_BLK_CGROUP
- /* puts the old one when swapping */
- if (*cur_css != ctx->sqo_blkcg_css) {
- kthread_associate_blkcg(ctx->sqo_blkcg_css);
- *cur_css = ctx->sqo_blkcg_css;
- }
-#endif
-}
-
-static void io_sq_thread_unassociate_blkcg(void)
-{
-#ifdef CONFIG_BLK_CGROUP
- kthread_associate_blkcg(NULL);
-#endif
-}
-
static inline void req_set_fail_links(struct io_kiocb *req)
{
if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
req->flags |= REQ_F_FAIL_LINK;
}
-/*
- * None of these are dereferenced, they are simply used to check if any of
- * them have changed. If we're under current and check they are still the
- * same, we're fine to grab references to them for actual out-of-line use.
- */
-static void io_init_identity(struct io_identity *id)
-{
- id->files = current->files;
- id->mm = current->mm;
-#ifdef CONFIG_BLK_CGROUP
- rcu_read_lock();
- id->blkcg_css = blkcg_css();
- rcu_read_unlock();
-#endif
- id->creds = current_cred();
- id->nsproxy = current->nsproxy;
- id->fs = current->fs;
- id->fsize = rlimit(RLIMIT_FSIZE);
-#ifdef CONFIG_AUDIT
- id->loginuid = current->loginuid;
- id->sessionid = current->sessionid;
-#endif
- refcount_set(&id->count, 1);
-}
-
-static inline void __io_req_init_async(struct io_kiocb *req)
-{
- memset(&req->work, 0, sizeof(req->work));
- req->flags |= REQ_F_WORK_INITIALIZED;
-}
-
-/*
- * Note: must call io_req_init_async() for the first time you
- * touch any members of io_wq_work.
- */
-static inline void io_req_init_async(struct io_kiocb *req)
-{
- struct io_uring_task *tctx = current->io_uring;
-
- if (req->flags & REQ_F_WORK_INITIALIZED)
- return;
-
- __io_req_init_async(req);
-
- /* Grab a ref if this isn't our static identity */
- req->work.identity = tctx->identity;
- if (tctx->identity != &tctx->__identity)
- refcount_inc(&req->work.identity->count);
-}
-
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -1378,111 +1165,11 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
-static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
-{
- if (req->work.identity == &tctx->__identity)
- return;
- if (refcount_dec_and_test(&req->work.identity->count))
- kfree(req->work.identity);
-}
-
-static void io_req_clean_work(struct io_kiocb *req)
-{
- if (!(req->flags & REQ_F_WORK_INITIALIZED))
- return;
-
- if (req->work.flags & IO_WQ_WORK_MM)
- mmdrop(req->work.identity->mm);
-#ifdef CONFIG_BLK_CGROUP
- if (req->work.flags & IO_WQ_WORK_BLKCG)
- css_put(req->work.identity->blkcg_css);
-#endif
- if (req->work.flags & IO_WQ_WORK_CREDS)
- put_cred(req->work.identity->creds);
- if (req->work.flags & IO_WQ_WORK_FS) {
- struct fs_struct *fs = req->work.identity->fs;
-
- spin_lock(&req->work.identity->fs->lock);
- if (--fs->users)
- fs = NULL;
- spin_unlock(&req->work.identity->fs->lock);
- if (fs)
- free_fs_struct(fs);
- }
- if (req->work.flags & IO_WQ_WORK_FILES) {
- put_files_struct(req->work.identity->files);
- put_nsproxy(req->work.identity->nsproxy);
- }
- if (req->flags & REQ_F_INFLIGHT) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_uring_task *tctx = req->task->io_uring;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->inflight_lock, flags);
- list_del(&req->inflight_entry);
- spin_unlock_irqrestore(&ctx->inflight_lock, flags);
- req->flags &= ~REQ_F_INFLIGHT;
- if (atomic_read(&tctx->in_idle))
- wake_up(&tctx->wait);
- }
-
- req->flags &= ~REQ_F_WORK_INITIALIZED;
- req->work.flags &= ~(IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FS |
- IO_WQ_WORK_CREDS | IO_WQ_WORK_FILES);
- io_put_identity(req->task->io_uring, req);
-}
-
-/*
- * Create a private copy of io_identity, since some fields don't match
- * the current context.
- */
-static bool io_identity_cow(struct io_kiocb *req)
-{
- struct io_uring_task *tctx = current->io_uring;
- const struct cred *creds = NULL;
- struct io_identity *id;
-
- if (req->work.flags & IO_WQ_WORK_CREDS)
- creds = req->work.identity->creds;
-
- id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL);
- if (unlikely(!id)) {
- req->work.flags |= IO_WQ_WORK_CANCEL;
- return false;
- }
-
- /*
- * We can safely just re-init the creds we copied Either the field
- * matches the current one, or we haven't grabbed it yet. The only
- * exception is ->creds, through registered personalities, so handle
- * that one separately.
- */
- io_init_identity(id);
- if (creds)
- id->creds = creds;
-
- /* add one for this request */
- refcount_inc(&id->count);
-
- /* drop tctx and req identity references, if needed */
- if (tctx->identity != &tctx->__identity &&
- refcount_dec_and_test(&tctx->identity->count))
- kfree(tctx->identity);
- if (req->work.identity != &tctx->__identity &&
- refcount_dec_and_test(&req->work.identity->count))
- kfree(req->work.identity);
-
- req->work.identity = id;
- tctx->identity = id;
- return true;
-}
-
static void io_req_track_inflight(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
if (!(req->flags & REQ_F_INFLIGHT)) {
- io_req_init_async(req);
req->flags |= REQ_F_INFLIGHT;
spin_lock_irq(&ctx->inflight_lock);
@@ -1491,86 +1178,11 @@ static void io_req_track_inflight(struct io_kiocb *req)
}
}
-static bool io_grab_identity(struct io_kiocb *req)
-{
- const struct io_op_def *def = &io_op_defs[req->opcode];
- struct io_identity *id = req->work.identity;
-
- if (def->work_flags & IO_WQ_WORK_FSIZE) {
- if (id->fsize != rlimit(RLIMIT_FSIZE))
- return false;
- req->work.flags |= IO_WQ_WORK_FSIZE;
- }
-#ifdef CONFIG_BLK_CGROUP
- if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
- (def->work_flags & IO_WQ_WORK_BLKCG)) {
- rcu_read_lock();
- if (id->blkcg_css != blkcg_css()) {
- rcu_read_unlock();
- return false;
- }
- /*
- * This should be rare, either the cgroup is dying or the task
- * is moving cgroups. Just punt to root for the handful of ios.
- */
- if (css_tryget_online(id->blkcg_css))
- req->work.flags |= IO_WQ_WORK_BLKCG;
- rcu_read_unlock();
- }
-#endif
- if (!(req->work.flags & IO_WQ_WORK_CREDS)) {
- if (id->creds != current_cred())
- return false;
- get_cred(id->creds);
- req->work.flags |= IO_WQ_WORK_CREDS;
- }
-#ifdef CONFIG_AUDIT
- if (!uid_eq(current->loginuid, id->loginuid) ||
- current->sessionid != id->sessionid)
- return false;
-#endif
- if (!(req->work.flags & IO_WQ_WORK_FS) &&
- (def->work_flags & IO_WQ_WORK_FS)) {
- if (current->fs != id->fs)
- return false;
- spin_lock(&id->fs->lock);
- if (!id->fs->in_exec) {
- id->fs->users++;
- req->work.flags |= IO_WQ_WORK_FS;
- } else {
- req->work.flags |= IO_WQ_WORK_CANCEL;
- }
- spin_unlock(&current->fs->lock);
- }
- if (!(req->work.flags & IO_WQ_WORK_FILES) &&
- (def->work_flags & IO_WQ_WORK_FILES) &&
- !(req->flags & REQ_F_NO_FILE_TABLE)) {
- if (id->files != current->files ||
- id->nsproxy != current->nsproxy)
- return false;
- atomic_inc(&id->files->count);
- get_nsproxy(id->nsproxy);
- req->work.flags |= IO_WQ_WORK_FILES;
- io_req_track_inflight(req);
- }
- if (!(req->work.flags & IO_WQ_WORK_MM) &&
- (def->work_flags & IO_WQ_WORK_MM)) {
- if (id->mm != current->mm)
- return false;
- mmgrab(id->mm);
- req->work.flags |= IO_WQ_WORK_MM;
- }
-
- return true;
-}
-
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
- io_req_init_async(req);
-
if (req->flags & REQ_F_FORCE_ASYNC)
req->work.flags |= IO_WQ_WORK_CONCURRENT;
@@ -1581,17 +1193,6 @@ static void io_prep_async_work(struct io_kiocb *req)
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
-
- /* if we fail grabbing identity, we must COW, regrab, and retry */
- if (io_grab_identity(req))
- return;
-
- if (!io_identity_cow(req))
- return;
-
- /* can't fail at this point */
- if (!io_grab_identity(req))
- WARN_ON(1);
}
static void io_prep_async_link(struct io_kiocb *req)
@@ -1602,25 +1203,20 @@ static void io_prep_async_link(struct io_kiocb *req)
io_prep_async_work(cur);
}
-static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
+static void io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
+ struct io_uring_task *tctx = req->task->io_uring;
+
+ BUG_ON(!tctx);
+ BUG_ON(!tctx->io_wq);
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
- io_wq_enqueue(ctx->io_wq, &req->work);
- return link;
-}
-
-static void io_queue_async_work(struct io_kiocb *req)
-{
- struct io_kiocb *link;
-
/* init ->work of the whole link before punting */
io_prep_async_link(req);
- link = __io_queue_async_work(req);
-
+ io_wq_enqueue(tctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
}
@@ -1855,18 +1451,22 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
return all_flushed;
}
-static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
struct task_struct *tsk,
struct files_struct *files)
{
+ bool ret = true;
+
if (test_bit(0, &ctx->cq_check_overflow)) {
/* iopoll syncs against uring_lock, not completion_lock */
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&ctx->uring_lock);
- __io_cqring_overflow_flush(ctx, force, tsk, files);
+ ret = __io_cqring_overflow_flush(ctx, force, tsk, files);
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_unlock(&ctx->uring_lock);
}
+
+ return ret;
}
static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
@@ -2048,9 +1648,19 @@ static void io_dismantle_req(struct io_kiocb *req)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
if (req->fixed_rsrc_refs)
percpu_ref_put(req->fixed_rsrc_refs);
- io_req_clean_work(req);
+
+ if (req->flags & REQ_F_INFLIGHT) {
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->inflight_lock, flags);
+ list_del(&req->inflight_entry);
+ spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ req->flags &= ~REQ_F_INFLIGHT;
+ }
}
+/* must to be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task, int nr)
{
struct io_uring_task *tctx = task->io_uring;
@@ -2134,15 +1744,7 @@ static void io_fail_links(struct io_kiocb *req)
trace_io_uring_fail_link(req, link);
io_cqring_fill_event(link, -ECANCELED);
- /*
- * It's ok to free under spinlock as they're not linked anymore,
- * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on
- * work.fs->lock.
- */
- if (link->flags & REQ_F_WORK_INITIALIZED)
- io_put_req_deferred(link, 2);
- else
- io_double_put_req(link);
+ io_put_req_deferred(link, 2);
link = nxt;
}
io_commit_cqring(ctx);
@@ -2179,6 +1781,18 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
return __io_req_find_next(req);
}
+static void ctx_flush_and_put(struct io_ring_ctx *ctx)
+{
+ if (!ctx)
+ return;
+ if (ctx->submit_state.comp.nr) {
+ mutex_lock(&ctx->uring_lock);
+ io_submit_flush_completions(&ctx->submit_state.comp, ctx);
+ mutex_unlock(&ctx->uring_lock);
+ }
+ percpu_ref_put(&ctx->refs);
+}
+
static bool __tctx_task_work(struct io_uring_task *tctx)
{
struct io_ring_ctx *ctx = NULL;
@@ -2196,30 +1810,20 @@ static bool __tctx_task_work(struct io_uring_task *tctx)
node = list.first;
while (node) {
struct io_wq_work_node *next = node->next;
- struct io_ring_ctx *this_ctx;
struct io_kiocb *req;
req = container_of(node, struct io_kiocb, io_task_work.node);
- this_ctx = req->ctx;
- req->task_work.func(&req->task_work);
- node = next;
-
- if (!ctx) {
- ctx = this_ctx;
- } else if (ctx != this_ctx) {
- mutex_lock(&ctx->uring_lock);
- io_submit_flush_completions(&ctx->submit_state.comp, ctx);
- mutex_unlock(&ctx->uring_lock);
- ctx = this_ctx;
+ if (req->ctx != ctx) {
+ ctx_flush_and_put(ctx);
+ ctx = req->ctx;
+ percpu_ref_get(&ctx->refs);
}
- }
- if (ctx && ctx->submit_state.comp.nr) {
- mutex_lock(&ctx->uring_lock);
- io_submit_flush_completions(&ctx->submit_state.comp, ctx);
- mutex_unlock(&ctx->uring_lock);
+ req->task_work.func(&req->task_work);
+ node = next;
}
+ ctx_flush_and_put(ctx);
return list.first != NULL;
}
@@ -2227,10 +1831,10 @@ static void tctx_task_work(struct callback_head *cb)
{
struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
+ clear_bit(0, &tctx->task_state);
+
while (__tctx_task_work(tctx))
cond_resched();
-
- clear_bit(0, &tctx->task_state);
}
static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
@@ -2303,11 +1907,14 @@ static int io_req_task_work_add(struct io_kiocb *req)
static void io_req_task_work_add_fallback(struct io_kiocb *req,
task_work_func_t cb)
{
- struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct callback_head *head;
init_task_work(&req->task_work, cb);
- task_work_add(tsk, &req->task_work, TWA_NONE);
- wake_up_process(tsk);
+ do {
+ head = READ_ONCE(ctx->exit_task_work);
+ req->task_work.next = head;
+ } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head);
}
static void __io_req_task_cancel(struct io_kiocb *req, int error)
@@ -2329,7 +1936,9 @@ static void io_req_task_cancel(struct callback_head *cb)
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct io_ring_ctx *ctx = req->ctx;
- __io_req_task_cancel(req, -ECANCELED);
+ mutex_lock(&ctx->uring_lock);
+ __io_req_task_cancel(req, req->result);
+ mutex_unlock(&ctx->uring_lock);
percpu_ref_put(&ctx->refs);
}
@@ -2339,15 +1948,11 @@ static void __io_req_task_submit(struct io_kiocb *req)
/* ctx stays valid until unlock, even if we drop all ours ctx->refs */
mutex_lock(&ctx->uring_lock);
- if (!ctx->sqo_dead && !(current->flags & PF_EXITING) &&
- !io_sq_thread_acquire_mm_files(ctx, req))
+ if (!(current->flags & PF_EXITING) && !current->in_execve)
__io_queue_sqe(req);
else
__io_req_task_cancel(req, -EFAULT);
mutex_unlock(&ctx->uring_lock);
-
- if (ctx->flags & IORING_SETUP_SQPOLL)
- io_sq_thread_drop_mm_files();
}
static void io_req_task_submit(struct callback_head *cb)
@@ -2364,11 +1969,22 @@ static void io_req_task_queue(struct io_kiocb *req)
req->task_work.func = io_req_task_submit;
ret = io_req_task_work_add(req);
if (unlikely(ret)) {
+ req->result = -ECANCELED;
percpu_ref_get(&req->ctx->refs);
io_req_task_work_add_fallback(req, io_req_task_cancel);
}
}
+static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
+{
+ percpu_ref_get(&req->ctx->refs);
+ req->result = ret;
+ req->task_work.func = io_req_task_cancel;
+
+ if (unlikely(io_req_task_work_add(req)))
+ io_req_task_work_add_fallback(req, io_req_task_cancel);
+}
+
static inline void io_queue_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = io_req_find_next(req);
@@ -2794,24 +2410,37 @@ static bool io_resubmit_prep(struct io_kiocb *req)
return false;
return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
}
-#endif
-static bool io_rw_reissue(struct io_kiocb *req)
+static bool io_rw_should_reissue(struct io_kiocb *req)
{
-#ifdef CONFIG_BLOCK
umode_t mode = file_inode(req->file)->i_mode;
- int ret;
+ struct io_ring_ctx *ctx = req->ctx;
if (!S_ISBLK(mode) && !S_ISREG(mode))
return false;
- if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
+ if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
+ !(ctx->flags & IORING_SETUP_IOPOLL)))
return false;
+ /*
+ * If ref is dying, we might be running poll reap from the exit work.
+ * Don't attempt to reissue from that path, just let it fail with
+ * -EAGAIN.
+ */
+ if (percpu_ref_is_dying(&ctx->refs))
+ return false;
+ return true;
+}
+#endif
- lockdep_assert_held(&req->ctx->uring_lock);
+static bool io_rw_reissue(struct io_kiocb *req)
+{
+#ifdef CONFIG_BLOCK
+ if (!io_rw_should_reissue(req))
+ return false;
- ret = io_sq_thread_acquire_mm_files(req->ctx, req);
+ lockdep_assert_held(&req->ctx->uring_lock);
- if (!ret && io_resubmit_prep(req)) {
+ if (io_resubmit_prep(req)) {
refcount_inc(&req->refs);
io_queue_async_work(req);
return true;
@@ -2849,6 +2478,19 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+#ifdef CONFIG_BLOCK
+ /* Rewind iter, if we have one. iopoll path resubmits as usual */
+ if (res == -EAGAIN && io_rw_should_reissue(req)) {
+ struct io_async_rw *rw = req->async_data;
+
+ if (rw)
+ iov_iter_revert(&rw->iter,
+ req->result - iov_iter_count(&rw->iter));
+ else if (!io_resubmit_prep(req))
+ res = -EIO;
+ }
+#endif
+
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
@@ -3467,19 +3109,9 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- ssize_t ret;
-
- ret = io_prep_rw(req, sqe);
- if (ret)
- return ret;
-
if (unlikely(!(req->file->f_mode & FMODE_READ)))
return -EBADF;
-
- /* either don't need iovec imported or already have it */
- if (!req->async_data)
- return 0;
- return io_rw_prep_async(req, READ);
+ return io_prep_rw(req, sqe);
}
/*
@@ -3607,10 +3239,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
ret = io_iter_do_read(req, iter);
if (ret == -EIOCBQUEUED) {
- /* it's faster to check here then delegate to kfree */
- if (iovec)
- kfree(iovec);
- return 0;
+ if (req->async_data)
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
+ goto out_free;
} else if (ret == -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -3631,6 +3262,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (ret2)
return ret2;
+ iovec = NULL;
rw = req->async_data;
/* now use our persistent iterator, if we aren't already */
iter = &rw->iter;
@@ -3654,27 +3286,22 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (ret == -EIOCBQUEUED)
return 0;
/* we got some bytes, but not all. retry. */
+ kiocb->ki_flags &= ~IOCB_WAITQ;
} while (ret > 0 && ret < io_size);
done:
kiocb_done(kiocb, ret, issue_flags);
+out_free:
+ /* it's faster to check here then delegate to kfree */
+ if (iovec)
+ kfree(iovec);
return 0;
}
static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- ssize_t ret;
-
- ret = io_prep_rw(req, sqe);
- if (ret)
- return ret;
-
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
-
- /* either don't need iovec imported or already have it */
- if (!req->async_data)
- return 0;
- return io_rw_prep_async(req, WRITE);
+ return io_prep_rw(req, sqe);
}
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
@@ -3746,6 +3373,8 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
/* no retry on NONBLOCK nor RWF_NOWAIT */
if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
goto done;
+ if (ret2 == -EIOCBQUEUED && req->async_data)
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
@@ -3924,7 +3553,6 @@ static int __io_splice_prep(struct io_kiocb *req,
* Splice operation will be punted aync, and here need to
* modify io_wq_work.flags, so initialize io_wq_work firstly.
*/
- io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
@@ -4011,7 +3639,7 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
-static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -4200,7 +3828,7 @@ err:
static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
- return io_openat2(req, issue_flags & IO_URING_F_NONBLOCK);
+ return io_openat2(req, issue_flags);
}
static int io_remove_buffers_prep(struct io_kiocb *req,
@@ -4598,13 +4226,10 @@ err:
return 0;
}
-static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
- if (!req->file)
- return -EBADF;
-
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
@@ -4664,11 +4289,21 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,
req->sr_msg.msg_flags, &iomsg->free_iov);
}
+static int io_sendmsg_prep_async(struct io_kiocb *req)
+{
+ int ret;
+
+ if (!io_op_defs[req->opcode].needs_async_data)
+ return 0;
+ ret = io_sendmsg_copy_hdr(req, req->async_data);
+ if (!ret)
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return ret;
+}
+
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_async_msghdr *async_msg = req->async_data;
struct io_sr_msg *sr = &req->sr_msg;
- int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -4681,13 +4316,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
-
- if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
- return 0;
- ret = io_sendmsg_copy_hdr(req, async_msg);
- if (!ret)
- req->flags |= REQ_F_NEED_CLEANUP;
- return ret;
+ return 0;
}
static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
@@ -4881,13 +4510,22 @@ static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
return io_put_kbuf(req, req->sr_msg.kbuf);
}
-static int io_recvmsg_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+static int io_recvmsg_prep_async(struct io_kiocb *req)
{
- struct io_async_msghdr *async_msg = req->async_data;
- struct io_sr_msg *sr = &req->sr_msg;
int ret;
+ if (!io_op_defs[req->opcode].needs_async_data)
+ return 0;
+ ret = io_recvmsg_copy_hdr(req, req->async_data);
+ if (!ret)
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return ret;
+}
+
+static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_sr_msg *sr = &req->sr_msg;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -4900,13 +4538,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
-
- if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
- return 0;
- ret = io_recvmsg_copy_hdr(req, async_msg);
- if (!ret)
- req->flags |= REQ_F_NEED_CLEANUP;
- return ret;
+ return 0;
}
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
@@ -5059,10 +4691,17 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
+static int io_connect_prep_async(struct io_kiocb *req)
+{
+ struct io_async_connect *io = req->async_data;
+ struct io_connect *conn = &req->connect;
+
+ return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
+}
+
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = &req->connect;
- struct io_async_connect *io = req->async_data;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -5071,12 +4710,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
conn->addr_len = READ_ONCE(sqe->addr2);
-
- if (!io)
- return 0;
-
- return move_addr_to_kernel(conn->addr, conn->addr_len,
- &io->address);
+ return 0;
}
static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
@@ -5121,56 +4755,32 @@ out:
return 0;
}
#else /* !CONFIG_NET */
-static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_send(struct io_kiocb *req, unsigned int issue_flags)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_recvmsg_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
-{
- return -EOPNOTSUPP;
-}
+#define IO_NETOP_FN(op) \
+static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
+{ \
+ return -EOPNOTSUPP; \
+}
+
+#define IO_NETOP_PREP(op) \
+IO_NETOP_FN(op) \
+static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
+{ \
+ return -EOPNOTSUPP; \
+} \
+
+#define IO_NETOP_PREP_ASYNC(op) \
+IO_NETOP_PREP(op) \
+static int io_##op##_prep_async(struct io_kiocb *req) \
+{ \
+ return -EOPNOTSUPP; \
+}
+
+IO_NETOP_PREP_ASYNC(sendmsg);
+IO_NETOP_PREP_ASYNC(recvmsg);
+IO_NETOP_PREP_ASYNC(connect);
+IO_NETOP_PREP(accept);
+IO_NETOP_FN(send);
+IO_NETOP_FN(recv);
#endif /* CONFIG_NET */
struct io_poll_table {
@@ -5357,6 +4967,9 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
pt->error = -EINVAL;
return;
}
+ /* double add on the same waitqueue head, ignore */
+ if (poll->head == head)
+ return;
poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
if (!poll) {
pt->error = -ENOMEM;
@@ -5892,6 +5505,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
data->mode = io_translate_timeout_mode(flags);
hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
+ io_req_track_inflight(req);
return 0;
}
@@ -5952,12 +5566,15 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data)
return req->user_data == (unsigned long) data;
}
-static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
+static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr)
{
enum io_wq_cancel cancel_ret;
int ret = 0;
- cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
+ if (!tctx->io_wq)
+ return -ENOENT;
+
+ cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false);
switch (cancel_ret) {
case IO_WQ_CANCEL_OK:
ret = 0;
@@ -5980,7 +5597,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
unsigned long flags;
int ret;
- ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
+ ret = io_async_cancel_one(req->task->io_uring,
+ (void *) (unsigned long) sqe_addr);
if (ret != -ENOENT) {
spin_lock_irqsave(&ctx->completion_lock, flags);
goto done;
@@ -6084,9 +5702,9 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
case IORING_OP_POLL_REMOVE:
return io_poll_remove_prep(req, sqe);
case IORING_OP_FSYNC:
- return io_prep_fsync(req, sqe);
+ return io_fsync_prep(req, sqe);
case IORING_OP_SYNC_FILE_RANGE:
- return io_prep_sfr(req, sqe);
+ return io_sfr_prep(req, sqe);
case IORING_OP_SENDMSG:
case IORING_OP_SEND:
return io_sendmsg_prep(req, sqe);
@@ -6144,14 +5762,39 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return-EINVAL;
}
-static int io_req_defer_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+static int io_req_prep_async(struct io_kiocb *req)
+{
+ switch (req->opcode) {
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ case IORING_OP_READ:
+ return io_rw_prep_async(req, READ);
+ case IORING_OP_WRITEV:
+ case IORING_OP_WRITE_FIXED:
+ case IORING_OP_WRITE:
+ return io_rw_prep_async(req, WRITE);
+ case IORING_OP_SENDMSG:
+ case IORING_OP_SEND:
+ return io_sendmsg_prep_async(req);
+ case IORING_OP_RECVMSG:
+ case IORING_OP_RECV:
+ return io_recvmsg_prep_async(req);
+ case IORING_OP_CONNECT:
+ return io_connect_prep_async(req);
+ }
+ return 0;
+}
+
+static int io_req_defer_prep(struct io_kiocb *req)
{
- if (!sqe)
+ if (!io_op_defs[req->opcode].needs_async_data)
return 0;
- if (io_alloc_async_data(req))
+ /* some opcodes init it during the inital prep */
+ if (req->async_data)
+ return 0;
+ if (__io_alloc_async_data(req))
return -EAGAIN;
- return io_req_prep(req, sqe);
+ return io_req_prep_async(req);
}
static u32 io_get_sequence(struct io_kiocb *req)
@@ -6167,7 +5810,7 @@ static u32 io_get_sequence(struct io_kiocb *req)
return total_submitted - nr_reqs;
}
-static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_req_defer(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_defer_entry *de;
@@ -6184,11 +5827,9 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
return 0;
- if (!req->async_data) {
- ret = io_req_defer_prep(req, sqe);
- if (ret)
- return ret;
- }
+ ret = io_req_defer_prep(req);
+ if (ret)
+ return ret;
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
if (!de)
@@ -6272,8 +5913,22 @@ static void __io_clean_op(struct io_kiocb *req)
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
+ const struct cred *creds = NULL;
int ret;
+ if (req->work.personality) {
+ const struct cred *new_creds;
+
+ if (!(issue_flags & IO_URING_F_NONBLOCK))
+ mutex_lock(&ctx->uring_lock);
+ new_creds = idr_find(&ctx->personality_idr, req->work.personality);
+ if (!(issue_flags & IO_URING_F_NONBLOCK))
+ mutex_unlock(&ctx->uring_lock);
+ if (!new_creds)
+ return -EINVAL;
+ creds = override_creds(new_creds);
+ }
+
switch (req->opcode) {
case IORING_OP_NOP:
ret = io_nop(req, issue_flags);
@@ -6380,6 +6035,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
break;
}
+ if (creds)
+ revert_creds(creds);
+
if (ret)
return ret;
@@ -6427,29 +6085,11 @@ static void io_wq_submit_work(struct io_wq_work *work)
} while (1);
}
+ /* avoid locking problems by failing it from a clean context */
if (ret) {
- struct io_ring_ctx *lock_ctx = NULL;
-
- if (req->ctx->flags & IORING_SETUP_IOPOLL)
- lock_ctx = req->ctx;
-
- /*
- * io_iopoll_complete() does not hold completion_lock to
- * complete polled io, so here for polled io, we can not call
- * io_req_complete() directly, otherwise there maybe concurrent
- * access to cqring, defer_list, etc, which is not safe. Given
- * that io_iopoll_complete() is always called under uring_lock,
- * so here for polled io, we also get uring_lock to complete
- * it.
- */
- if (lock_ctx)
- mutex_lock(&lock_ctx->uring_lock);
-
- req_set_fail_links(req);
- io_req_complete(req, ret);
-
- if (lock_ctx)
- mutex_unlock(&lock_ctx->uring_lock);
+ /* io-wq is going to take one down */
+ refcount_inc(&req->refs);
+ io_req_task_queue_fail(req, ret);
}
}
@@ -6561,19 +6201,10 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
static void __io_queue_sqe(struct io_kiocb *req)
{
struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
- const struct cred *old_creds = NULL;
int ret;
- if ((req->flags & REQ_F_WORK_INITIALIZED) &&
- (req->work.flags & IO_WQ_WORK_CREDS) &&
- req->work.identity->creds != current_cred())
- old_creds = override_creds(req->work.identity->creds);
-
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
- if (old_creds)
- revert_creds(old_creds);
-
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
@@ -6607,11 +6238,11 @@ static void __io_queue_sqe(struct io_kiocb *req)
io_queue_linked_timeout(linked_timeout);
}
-static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static void io_queue_sqe(struct io_kiocb *req)
{
int ret;
- ret = io_req_defer(req, sqe);
+ ret = io_req_defer(req);
if (ret) {
if (ret != -EIOCBQUEUED) {
fail_req:
@@ -6620,42 +6251,133 @@ fail_req:
io_req_complete(req, ret);
}
} else if (req->flags & REQ_F_FORCE_ASYNC) {
- if (!req->async_data) {
- ret = io_req_defer_prep(req, sqe);
- if (unlikely(ret))
- goto fail_req;
- }
+ ret = io_req_defer_prep(req);
+ if (unlikely(ret))
+ goto fail_req;
io_queue_async_work(req);
} else {
- if (sqe) {
- ret = io_req_prep(req, sqe);
- if (unlikely(ret))
- goto fail_req;
- }
__io_queue_sqe(req);
}
}
-static inline void io_queue_link_head(struct io_kiocb *req)
+/*
+ * Check SQE restrictions (opcode and flags).
+ *
+ * Returns 'true' if SQE is allowed, 'false' otherwise.
+ */
+static inline bool io_check_restriction(struct io_ring_ctx *ctx,
+ struct io_kiocb *req,
+ unsigned int sqe_flags)
{
- if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
- io_put_req(req);
- io_req_complete(req, -ECANCELED);
- } else
- io_queue_sqe(req, NULL);
+ if (!ctx->restricted)
+ return true;
+
+ if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
+ return false;
+
+ if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
+ ctx->restrictions.sqe_flags_required)
+ return false;
+
+ if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
+ ctx->restrictions.sqe_flags_required))
+ return false;
+
+ return true;
}
-struct io_submit_link {
- struct io_kiocb *head;
- struct io_kiocb *last;
-};
+static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_submit_state *state;
+ unsigned int sqe_flags;
+ int ret = 0;
+
+ req->opcode = READ_ONCE(sqe->opcode);
+ /* same numerical values with corresponding REQ_F_*, safe to copy */
+ req->flags = sqe_flags = READ_ONCE(sqe->flags);
+ req->user_data = READ_ONCE(sqe->user_data);
+ req->async_data = NULL;
+ req->file = NULL;
+ req->ctx = ctx;
+ req->link = NULL;
+ req->fixed_rsrc_refs = NULL;
+ /* one is dropped after submission, the other at completion */
+ refcount_set(&req->refs, 2);
+ req->task = current;
+ req->result = 0;
+
+ /* enforce forwards compatibility on users */
+ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
+ req->flags = 0;
+ return -EINVAL;
+ }
+
+ if (unlikely(req->opcode >= IORING_OP_LAST))
+ return -EINVAL;
+
+ if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
+ return -EACCES;
+
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[req->opcode].buffer_select)
+ return -EOPNOTSUPP;
+
+ req->work.list.next = NULL;
+ req->work.flags = 0;
+ req->work.personality = READ_ONCE(sqe->personality);
+ state = &ctx->submit_state;
+
+ /*
+ * Plug now if we have more than 1 IO left after this, and the target
+ * is potentially a read/write to block based storage.
+ */
+ if (!state->plug_started && state->ios_left > 1 &&
+ io_op_defs[req->opcode].plug) {
+ blk_start_plug(&state->plug);
+ state->plug_started = true;
+ }
+
+ if (io_op_defs[req->opcode].needs_file) {
+ bool fixed = req->flags & REQ_F_FIXED_FILE;
+
+ req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
+ if (unlikely(!req->file))
+ ret = -EBADF;
+ }
+
+ state->ios_left--;
+ return ret;
+}
-static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_submit_link *link)
+static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
{
- struct io_ring_ctx *ctx = req->ctx;
+ struct io_submit_link *link = &ctx->submit_state.link;
int ret;
+ ret = io_init_req(ctx, req, sqe);
+ if (unlikely(ret)) {
+fail_req:
+ io_put_req(req);
+ io_req_complete(req, ret);
+ if (link->head) {
+ /* fail even hard links since we don't submit */
+ link->head->flags |= REQ_F_FAIL_LINK;
+ io_put_req(link->head);
+ io_req_complete(link->head, -ECANCELED);
+ link->head = NULL;
+ }
+ return ret;
+ }
+ ret = io_req_prep(req, sqe);
+ if (unlikely(ret))
+ goto fail_req;
+
+ /* don't need @sqe from now on */
+ trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
+ true, ctx->flags & IORING_SETUP_SQPOLL);
+
/*
* If we already have a head request, queue this one for async
* submittal once the head completes. If we don't have a head but
@@ -6677,19 +6399,16 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
head->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 1;
}
- ret = io_req_defer_prep(req, sqe);
- if (unlikely(ret)) {
- /* fail even hard links since we don't submit */
- head->flags |= REQ_F_FAIL_LINK;
- return ret;
- }
+ ret = io_req_defer_prep(req);
+ if (unlikely(ret))
+ goto fail_req;
trace_io_uring_link(ctx, req, head);
link->last->link = req;
link->last = req;
/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- io_queue_link_head(head);
+ io_queue_sqe(head);
link->head = NULL;
}
} else {
@@ -6698,13 +6417,10 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ctx->drain_next = 0;
}
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- ret = io_req_defer_prep(req, sqe);
- if (unlikely(ret))
- req->flags |= REQ_F_FAIL_LINK;
link->head = req;
link->last = req;
} else {
- io_queue_sqe(req, sqe);
+ io_queue_sqe(req);
}
}
@@ -6717,6 +6433,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
static void io_submit_state_end(struct io_submit_state *state,
struct io_ring_ctx *ctx)
{
+ if (state->link.head)
+ io_queue_sqe(state->link.head);
if (state->comp.nr)
io_submit_flush_completions(&state->comp, ctx);
if (state->plug_started)
@@ -6732,6 +6450,8 @@ static void io_submit_state_start(struct io_submit_state *state,
{
state->plug_started = false;
state->ios_left = max_ios;
+ /* set only head, no need to init link_last in advance */
+ state->link.head = NULL;
}
static void io_commit_sqring(struct io_ring_ctx *ctx)
@@ -6777,117 +6497,9 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
return NULL;
}
-/*
- * Check SQE restrictions (opcode and flags).
- *
- * Returns 'true' if SQE is allowed, 'false' otherwise.
- */
-static inline bool io_check_restriction(struct io_ring_ctx *ctx,
- struct io_kiocb *req,
- unsigned int sqe_flags)
-{
- if (!ctx->restricted)
- return true;
-
- if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
- return false;
-
- if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
- ctx->restrictions.sqe_flags_required)
- return false;
-
- if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
- ctx->restrictions.sqe_flags_required))
- return false;
-
- return true;
-}
-
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
-
-static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_submit_state *state;
- unsigned int sqe_flags;
- int id, ret = 0;
-
- req->opcode = READ_ONCE(sqe->opcode);
- /* same numerical values with corresponding REQ_F_*, safe to copy */
- req->flags = sqe_flags = READ_ONCE(sqe->flags);
- req->user_data = READ_ONCE(sqe->user_data);
- req->async_data = NULL;
- req->file = NULL;
- req->ctx = ctx;
- req->link = NULL;
- req->fixed_rsrc_refs = NULL;
- /* one is dropped after submission, the other at completion */
- refcount_set(&req->refs, 2);
- req->task = current;
- req->result = 0;
-
- /* enforce forwards compatibility on users */
- if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
- return -EINVAL;
-
- if (unlikely(req->opcode >= IORING_OP_LAST))
- return -EINVAL;
-
- if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
- return -EFAULT;
-
- if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
- return -EACCES;
-
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[req->opcode].buffer_select)
- return -EOPNOTSUPP;
-
- id = READ_ONCE(sqe->personality);
- if (id) {
- struct io_identity *iod;
-
- iod = idr_find(&ctx->personality_idr, id);
- if (unlikely(!iod))
- return -EINVAL;
- refcount_inc(&iod->count);
-
- __io_req_init_async(req);
- get_cred(iod->creds);
- req->work.identity = iod;
- req->work.flags |= IO_WQ_WORK_CREDS;
- }
-
- state = &ctx->submit_state;
-
- /*
- * Plug now if we have more than 1 IO left after this, and the target
- * is potentially a read/write to block based storage.
- */
- if (!state->plug_started && state->ios_left > 1 &&
- io_op_defs[req->opcode].plug) {
- blk_start_plug(&state->plug);
- state->plug_started = true;
- }
-
- if (io_op_defs[req->opcode].needs_file) {
- bool fixed = req->flags & REQ_F_FIXED_FILE;
-
- req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
- if (unlikely(!req->file))
- ret = -EBADF;
- }
-
- state->ios_left--;
- return ret;
-}
-
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
- struct io_submit_link link;
- int i, submitted = 0;
+ int submitted = 0;
/* if we have a backlog and couldn't flush it all, return BUSY */
if (test_bit(0, &ctx->sq_check_overflow)) {
@@ -6903,14 +6515,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
percpu_counter_add(&current->io_uring->inflight, nr);
refcount_add(nr, &current->usage);
-
io_submit_state_start(&ctx->submit_state, nr);
- link.head = NULL;
- for (i = 0; i < nr; i++) {
+ while (submitted < nr) {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
- int err;
req = io_alloc_req(ctx);
if (unlikely(!req)) {
@@ -6925,20 +6534,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
}
/* will complete beyond this point, count as submitted */
submitted++;
-
- err = io_init_req(ctx, req, sqe);
- if (unlikely(err)) {
-fail_req:
- io_put_req(req);
- io_req_complete(req, err);
+ if (io_submit_sqe(ctx, req, sqe))
break;
- }
-
- trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
- true, ctx->flags & IORING_SETUP_SQPOLL);
- err = io_submit_sqe(req, sqe, &link);
- if (err)
- goto fail_req;
}
if (unlikely(submitted != nr)) {
@@ -6950,10 +6547,8 @@ fail_req:
percpu_counter_sub(&tctx->inflight, unused);
put_task_struct_many(current, unused);
}
- if (link.head)
- io_queue_link_head(link.head);
- io_submit_state_end(&ctx->submit_state, ctx);
+ io_submit_state_end(&ctx->submit_state, ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
@@ -6992,8 +6587,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
if (!list_empty(&ctx->iopoll_list))
io_do_iopoll(ctx, &nr_events, 0);
- if (to_submit && !ctx->sqo_dead &&
- likely(!percpu_ref_is_dying(&ctx->refs)))
+ if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)))
ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
}
@@ -7030,71 +6624,94 @@ static void io_sqd_init_new(struct io_sq_data *sqd)
io_sqd_update_thread_idle(sqd);
}
+static bool io_sq_thread_should_stop(struct io_sq_data *sqd)
+{
+ return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+}
+
+static bool io_sq_thread_should_park(struct io_sq_data *sqd)
+{
+ return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+}
+
+static void io_sq_thread_parkme(struct io_sq_data *sqd)
+{
+ for (;;) {
+ /*
+ * TASK_PARKED is a special state; we must serialize against
+ * possible pending wakeups to avoid store-store collisions on
+ * task->state.
+ *
+ * Such a collision might possibly result in the task state
+ * changin from TASK_PARKED and us failing the
+ * wait_task_inactive() in kthread_park().
+ */
+ set_special_state(TASK_PARKED);
+ if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state))
+ break;
+
+ /*
+ * Thread is going to call schedule(), do not preempt it,
+ * or the caller of kthread_park() may spend more time in
+ * wait_task_inactive().
+ */
+ preempt_disable();
+ complete(&sqd->parked);
+ schedule_preempt_disabled();
+ preempt_enable();
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
static int io_sq_thread(void *data)
{
- struct cgroup_subsys_state *cur_css = NULL;
- struct files_struct *old_files = current->files;
- struct nsproxy *old_nsproxy = current->nsproxy;
- const struct cred *old_cred = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
unsigned long timeout = 0;
+ char buf[TASK_COMM_LEN];
DEFINE_WAIT(wait);
- task_lock(current);
- current->files = NULL;
- current->nsproxy = NULL;
- task_unlock(current);
+ sprintf(buf, "iou-sqp-%d", sqd->task_pid);
+ set_task_comm(current, buf);
+ current->pf_io_worker = NULL;
+
+ if (sqd->sq_cpu != -1)
+ set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
+ else
+ set_cpus_allowed_ptr(current, cpu_online_mask);
+ current->flags |= PF_NO_SETAFFINITY;
- while (!kthread_should_stop()) {
+ wait_for_completion(&sqd->startup);
+
+ while (!io_sq_thread_should_stop(sqd)) {
int ret;
bool cap_entries, sqt_spin, needs_sched;
/*
* Any changes to the sqd lists are synchronized through the
- * kthread parking. This synchronizes the thread vs users,
+ * thread parking. This synchronizes the thread vs users,
* the users are synchronized on the sqd->ctx_lock.
*/
- if (kthread_should_park()) {
- kthread_parkme();
- /*
- * When sq thread is unparked, in case the previous park operation
- * comes from io_put_sq_data(), which means that sq thread is going
- * to be stopped, so here needs to have a check.
- */
- if (kthread_should_stop())
- break;
+ if (io_sq_thread_should_park(sqd)) {
+ io_sq_thread_parkme(sqd);
+ continue;
}
-
if (unlikely(!list_empty(&sqd->ctx_new_list))) {
io_sqd_init_new(sqd);
timeout = jiffies + sqd->sq_thread_idle;
}
-
+ if (fatal_signal_pending(current))
+ break;
sqt_spin = false;
cap_entries = !list_is_singular(&sqd->ctx_list);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- if (current->cred != ctx->creds) {
- if (old_cred)
- revert_creds(old_cred);
- old_cred = override_creds(ctx->creds);
- }
- io_sq_thread_associate_blkcg(ctx, &cur_css);
-#ifdef CONFIG_AUDIT
- current->loginuid = ctx->loginuid;
- current->sessionid = ctx->sessionid;
-#endif
-
ret = __io_sq_thread(ctx, cap_entries);
if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
sqt_spin = true;
-
- io_sq_thread_drop_mm_files();
}
if (sqt_spin || !time_after(jiffies, timeout)) {
io_run_task_work();
- io_sq_thread_drop_mm_files();
cond_resched();
if (sqt_spin)
timeout = jiffies + sqd->sq_thread_idle;
@@ -7115,11 +6732,12 @@ static int io_sq_thread(void *data)
}
}
- if (needs_sched && !kthread_should_park()) {
+ if (needs_sched && !io_sq_thread_should_park(sqd)) {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
schedule();
+ try_to_freeze();
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_clear_wakeup_flag(ctx);
}
@@ -7128,22 +6746,32 @@ static int io_sq_thread(void *data)
timeout = jiffies + sqd->sq_thread_idle;
}
- io_run_task_work();
- io_sq_thread_drop_mm_files();
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+ io_uring_cancel_sqpoll(ctx);
- if (cur_css)
- io_sq_thread_unassociate_blkcg();
- if (old_cred)
- revert_creds(old_cred);
+ io_run_task_work();
- task_lock(current);
- current->files = old_files;
- current->nsproxy = old_nsproxy;
- task_unlock(current);
+ /*
+ * Ensure that we park properly if racing with someone trying to park
+ * while we're exiting. If we fail to grab the lock, check park and
+ * park if necessary. The ordering with the park bit and the lock
+ * ensures that we catch this reliably.
+ */
+ if (!mutex_trylock(&sqd->lock)) {
+ if (io_sq_thread_should_park(sqd))
+ io_sq_thread_parkme(sqd);
+ mutex_lock(&sqd->lock);
+ }
- kthread_parkme();
+ sqd->thread = NULL;
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ ctx->sqo_exec = 1;
+ io_ring_set_wakeup_flag(ctx);
+ }
- return 0;
+ complete(&sqd->exited);
+ mutex_unlock(&sqd->lock);
+ do_exit(0);
}
struct io_wait_queue {
@@ -7264,11 +6892,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
trace_io_uring_cqring_wait(ctx, min_events);
do {
- io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ /* if we can't even flush overflow, don't wait for more */
+ if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) {
+ ret = -EBUSY;
+ break;
+ }
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
finish_wait(&ctx->wait, &iowq.wq);
+ cond_resched();
} while (ret > 0);
restore_saved_sigmask_unless(ret == -EINTR);
@@ -7328,38 +6961,59 @@ static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
percpu_ref_get(&rsrc_data->refs);
}
-static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
- struct io_ring_ctx *ctx,
- struct fixed_rsrc_ref_node *backup_node)
+static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data)
{
- struct fixed_rsrc_ref_node *ref_node;
- int ret;
+ struct fixed_rsrc_ref_node *ref_node = NULL;
io_rsrc_ref_lock(ctx);
ref_node = data->node;
+ data->node = NULL;
io_rsrc_ref_unlock(ctx);
if (ref_node)
percpu_ref_kill(&ref_node->refs);
+}
+
+static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
+ struct io_ring_ctx *ctx,
+ void (*rsrc_put)(struct io_ring_ctx *ctx,
+ struct io_rsrc_put *prsrc))
+{
+ struct fixed_rsrc_ref_node *backup_node;
+ int ret;
- percpu_ref_kill(&data->refs);
+ if (data->quiesce)
+ return -ENXIO;
- /* wait for all refs nodes to complete */
- flush_delayed_work(&ctx->rsrc_put_work);
+ data->quiesce = true;
do {
+ ret = -ENOMEM;
+ backup_node = alloc_fixed_rsrc_ref_node(ctx);
+ if (!backup_node)
+ break;
+ backup_node->rsrc_data = data;
+ backup_node->rsrc_put = rsrc_put;
+
+ io_sqe_rsrc_kill_node(ctx, data);
+ percpu_ref_kill(&data->refs);
+ flush_delayed_work(&ctx->rsrc_put_work);
+
ret = wait_for_completion_interruptible(&data->done);
if (!ret)
break;
+
+ percpu_ref_resurrect(&data->refs);
+ io_sqe_rsrc_set_node(ctx, data, backup_node);
+ backup_node = NULL;
+ reinit_completion(&data->done);
+ mutex_unlock(&ctx->uring_lock);
ret = io_run_task_work_sig();
- if (ret < 0) {
- percpu_ref_resurrect(&data->refs);
- reinit_completion(&data->done);
- io_sqe_rsrc_set_node(ctx, data, backup_node);
- return ret;
- }
- } while (1);
+ mutex_lock(&ctx->uring_lock);
+ } while (ret >= 0);
+ data->quiesce = false;
- destroy_fixed_rsrc_ref_node(backup_node);
- return 0;
+ if (backup_node)
+ destroy_fixed_rsrc_ref_node(backup_node);
+ return ret;
}
static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
@@ -7390,18 +7044,17 @@ static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
struct fixed_rsrc_data *data = ctx->file_data;
- struct fixed_rsrc_ref_node *backup_node;
unsigned nr_tables, i;
int ret;
- if (!data)
+ /*
+ * percpu_ref_is_dying() is to stop parallel files unregister
+ * Since we possibly drop uring lock later in this function to
+ * run task work.
+ */
+ if (!data || percpu_ref_is_dying(&data->refs))
return -ENXIO;
- backup_node = alloc_fixed_rsrc_ref_node(ctx);
- if (!backup_node)
- return -ENOMEM;
- init_fixed_file_ref_node(ctx, backup_node);
-
- ret = io_rsrc_ref_quiesce(data, ctx, backup_node);
+ ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put);
if (ret)
return ret;
@@ -7415,20 +7068,76 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
return 0;
}
+static void io_sq_thread_unpark(struct io_sq_data *sqd)
+ __releases(&sqd->lock)
+{
+ if (sqd->thread == current)
+ return;
+ clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+ if (sqd->thread)
+ wake_up_state(sqd->thread, TASK_PARKED);
+ mutex_unlock(&sqd->lock);
+}
+
+static void io_sq_thread_park(struct io_sq_data *sqd)
+ __acquires(&sqd->lock)
+{
+ if (sqd->thread == current)
+ return;
+ set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+ mutex_lock(&sqd->lock);
+ if (sqd->thread) {
+ wake_up_process(sqd->thread);
+ wait_for_completion(&sqd->parked);
+ }
+}
+
+static void io_sq_thread_stop(struct io_sq_data *sqd)
+{
+ if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state))
+ return;
+ mutex_lock(&sqd->lock);
+ if (sqd->thread) {
+ set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state));
+ wake_up_process(sqd->thread);
+ mutex_unlock(&sqd->lock);
+ wait_for_completion(&sqd->exited);
+ WARN_ON_ONCE(sqd->thread);
+ } else {
+ mutex_unlock(&sqd->lock);
+ }
+}
+
static void io_put_sq_data(struct io_sq_data *sqd)
{
if (refcount_dec_and_test(&sqd->refs)) {
- /*
- * The park is a bit of a work-around, without it we get
- * warning spews on shutdown with SQPOLL set and affinity
- * set to a single CPU.
- */
+ io_sq_thread_stop(sqd);
+ kfree(sqd);
+ }
+}
+
+static void io_sq_thread_finish(struct io_ring_ctx *ctx)
+{
+ struct io_sq_data *sqd = ctx->sq_data;
+
+ if (sqd) {
+ complete(&sqd->startup);
if (sqd->thread) {
- kthread_park(sqd->thread);
- kthread_stop(sqd->thread);
+ wait_for_completion(&ctx->sq_thread_comp);
+ io_sq_thread_park(sqd);
}
- kfree(sqd);
+ mutex_lock(&sqd->ctx_lock);
+ list_del(&ctx->sqd_list);
+ io_sqd_update_thread_idle(sqd);
+ mutex_unlock(&sqd->ctx_lock);
+
+ if (sqd->thread)
+ io_sq_thread_unpark(sqd);
+
+ io_put_sq_data(sqd);
+ ctx->sq_data = NULL;
}
}
@@ -7475,68 +7184,12 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
mutex_init(&sqd->ctx_lock);
mutex_init(&sqd->lock);
init_waitqueue_head(&sqd->wait);
+ init_completion(&sqd->startup);
+ init_completion(&sqd->parked);
+ init_completion(&sqd->exited);
return sqd;
}
-static void io_sq_thread_unpark(struct io_sq_data *sqd)
- __releases(&sqd->lock)
-{
- if (!sqd->thread)
- return;
- kthread_unpark(sqd->thread);
- mutex_unlock(&sqd->lock);
-}
-
-static void io_sq_thread_park(struct io_sq_data *sqd)
- __acquires(&sqd->lock)
-{
- if (!sqd->thread)
- return;
- mutex_lock(&sqd->lock);
- kthread_park(sqd->thread);
-}
-
-static void io_sq_thread_stop(struct io_ring_ctx *ctx)
-{
- struct io_sq_data *sqd = ctx->sq_data;
-
- if (sqd) {
- if (sqd->thread) {
- /*
- * We may arrive here from the error branch in
- * io_sq_offload_create() where the kthread is created
- * without being waked up, thus wake it up now to make
- * sure the wait will complete.
- */
- wake_up_process(sqd->thread);
- wait_for_completion(&ctx->sq_thread_comp);
-
- io_sq_thread_park(sqd);
- }
-
- mutex_lock(&sqd->ctx_lock);
- list_del(&ctx->sqd_list);
- io_sqd_update_thread_idle(sqd);
- mutex_unlock(&sqd->ctx_lock);
-
- if (sqd->thread)
- io_sq_thread_unpark(sqd);
-
- io_put_sq_data(sqd);
- ctx->sq_data = NULL;
- }
-}
-
-static void io_finish_async(struct io_ring_ctx *ctx)
-{
- io_sq_thread_stop(ctx);
-
- if (ctx->io_wq) {
- io_wq_destroy(ctx->io_wq);
- ctx->io_wq = NULL;
- }
-}
-
#if defined(CONFIG_UNIX)
/*
* Ensure the UNIX gc is aware of our file set, so we are certain that
@@ -7563,7 +7216,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
skb->sk = sk;
nr_files = 0;
- fpl->user = get_uid(ctx->user);
+ fpl->user = get_uid(current_user());
for (i = 0; i < nr; i++) {
struct file *file = io_file_from_index(ctx, i + offset);
@@ -8095,54 +7748,34 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work)
return req ? &req->work : NULL;
}
-static int io_init_wq_offload(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
{
+ struct io_wq_hash *hash;
struct io_wq_data data;
- struct fd f;
- struct io_ring_ctx *ctx_attach;
unsigned int concurrency;
- int ret = 0;
-
- data.user = ctx->user;
- data.free_work = io_free_work;
- data.do_work = io_wq_submit_work;
- if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
- /* Do QD, or 4 * CPUS, whatever is smallest */
- concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
-
- ctx->io_wq = io_wq_create(concurrency, &data);
- if (IS_ERR(ctx->io_wq)) {
- ret = PTR_ERR(ctx->io_wq);
- ctx->io_wq = NULL;
- }
- return ret;
+ hash = ctx->hash_map;
+ if (!hash) {
+ hash = kzalloc(sizeof(*hash), GFP_KERNEL);
+ if (!hash)
+ return ERR_PTR(-ENOMEM);
+ refcount_set(&hash->refs, 1);
+ init_waitqueue_head(&hash->wait);
+ ctx->hash_map = hash;
}
- f = fdget(p->wq_fd);
- if (!f.file)
- return -EBADF;
-
- if (f.file->f_op != &io_uring_fops) {
- ret = -EINVAL;
- goto out_fput;
- }
+ data.hash = hash;
+ data.free_work = io_free_work;
+ data.do_work = io_wq_submit_work;
- ctx_attach = f.file->private_data;
- /* @io_wq is protected by holding the fd */
- if (!io_wq_get(ctx_attach->io_wq, &data)) {
- ret = -EINVAL;
- goto out_fput;
- }
+ /* Do QD, or 4 * CPUS, whatever is smallest */
+ concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
- ctx->io_wq = ctx_attach->io_wq;
-out_fput:
- fdput(f);
- return ret;
+ return io_wq_create(concurrency, &data);
}
-static int io_uring_alloc_task_context(struct task_struct *task)
+static int io_uring_alloc_task_context(struct task_struct *task,
+ struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx;
int ret;
@@ -8157,13 +7790,19 @@ static int io_uring_alloc_task_context(struct task_struct *task)
return ret;
}
+ tctx->io_wq = io_init_wq_offload(ctx);
+ if (IS_ERR(tctx->io_wq)) {
+ ret = PTR_ERR(tctx->io_wq);
+ percpu_counter_destroy(&tctx->inflight);
+ kfree(tctx);
+ return ret;
+ }
+
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
tctx->last = NULL;
atomic_set(&tctx->in_idle, 0);
tctx->sqpoll = false;
- io_init_identity(&tctx->__identity);
- tctx->identity = &tctx->__identity;
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
@@ -8177,20 +7816,54 @@ void __io_uring_free(struct task_struct *tsk)
struct io_uring_task *tctx = tsk->io_uring;
WARN_ON_ONCE(!xa_empty(&tctx->xa));
- WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1);
- if (tctx->identity != &tctx->__identity)
- kfree(tctx->identity);
+ WARN_ON_ONCE(tctx->io_wq);
+
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
tsk->io_uring = NULL;
}
+static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx)
+{
+ struct task_struct *tsk;
+ int ret;
+
+ clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ reinit_completion(&sqd->parked);
+ ctx->sqo_exec = 0;
+ sqd->task_pid = current->pid;
+ tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
+ if (IS_ERR(tsk))
+ return PTR_ERR(tsk);
+ ret = io_uring_alloc_task_context(tsk, ctx);
+ if (ret)
+ set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ sqd->thread = tsk;
+ wake_up_new_task(tsk);
+ return ret;
+}
+
static int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
int ret;
+ /* Retain compatibility with failing for an invalid attach attempt */
+ if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
+ IORING_SETUP_ATTACH_WQ) {
+ struct fd f;
+
+ f = fdget(p->wq_fd);
+ if (!f.file)
+ return -ENXIO;
+ if (f.file->f_op != &io_uring_fops) {
+ fdput(f);
+ return -EINVAL;
+ }
+ fdput(f);
+ }
if (ctx->flags & IORING_SETUP_SQPOLL) {
+ struct task_struct *tsk;
struct io_sq_data *sqd;
ret = -EPERM;
@@ -8215,7 +7888,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
ctx->sq_thread_idle = HZ;
if (sqd->thread)
- goto done;
+ return 0;
if (p->flags & IORING_SETUP_SQ_AFF) {
int cpu = p->sq_thread_cpu;
@@ -8226,18 +7899,22 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
if (!cpu_online(cpu))
goto err;
- sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd,
- cpu, "io_uring-sq");
+ sqd->sq_cpu = cpu;
} else {
- sqd->thread = kthread_create(io_sq_thread, sqd,
- "io_uring-sq");
+ sqd->sq_cpu = -1;
}
- if (IS_ERR(sqd->thread)) {
- ret = PTR_ERR(sqd->thread);
- sqd->thread = NULL;
+
+ sqd->task_pid = current->pid;
+ tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
+ if (IS_ERR(tsk)) {
+ ret = PTR_ERR(tsk);
goto err;
}
- ret = io_uring_alloc_task_context(sqd->thread);
+ ret = io_uring_alloc_task_context(tsk, ctx);
+ if (ret)
+ set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ sqd->thread = tsk;
+ wake_up_new_task(tsk);
if (ret)
goto err;
} else if (p->flags & IORING_SETUP_SQ_AFF) {
@@ -8246,14 +7923,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
goto err;
}
-done:
- ret = io_init_wq_offload(ctx, p);
- if (ret)
- goto err;
-
return 0;
err:
- io_finish_async(ctx);
+ io_sq_thread_finish(ctx);
return ret;
}
@@ -8261,8 +7933,9 @@ static void io_sq_offload_start(struct io_ring_ctx *ctx)
{
struct io_sq_data *sqd = ctx->sq_data;
- if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
- wake_up_process(sqd->thread);
+ ctx->flags &= ~IORING_SETUP_R_DISABLED;
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ complete(&sqd->startup);
}
static inline void __io_unaccount_mem(struct user_struct *user,
@@ -8292,7 +7965,7 @@ static inline int __io_account_mem(struct user_struct *user,
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
- if (ctx->limit_mem)
+ if (ctx->user)
__io_unaccount_mem(ctx->user, nr_pages);
if (ctx->mm_account)
@@ -8303,7 +7976,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
int ret;
- if (ctx->limit_mem) {
+ if (ctx->user) {
ret = __io_account_mem(ctx->user, nr_pages);
if (ret)
return ret;
@@ -8699,22 +8372,26 @@ static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
}
}
-static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static void io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_submit_state *submit_state = &ctx->submit_state;
+ struct io_comp_state *cs = &ctx->submit_state.comp;
mutex_lock(&ctx->uring_lock);
- if (submit_state->free_reqs)
+ if (submit_state->free_reqs) {
kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
submit_state->reqs);
-
- io_req_cache_free(&submit_state->comp.free_list, NULL);
+ submit_state->free_reqs = 0;
+ }
spin_lock_irq(&ctx->completion_lock);
- io_req_cache_free(&submit_state->comp.locked_free_list, NULL);
+ list_splice_init(&cs->locked_free_list, &cs->free_list);
+ cs->locked_free_nr = 0;
spin_unlock_irq(&ctx->completion_lock);
+ io_req_cache_free(&cs->free_list, NULL);
+
mutex_unlock(&ctx->uring_lock);
}
@@ -8728,22 +8405,17 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
mutex_lock(&ctx->uring_lock);
mutex_unlock(&ctx->uring_lock);
- io_finish_async(ctx);
+ io_sq_thread_finish(ctx);
io_sqe_buffers_unregister(ctx);
- if (ctx->sqo_task) {
- put_task_struct(ctx->sqo_task);
- ctx->sqo_task = NULL;
+ if (ctx->mm_account) {
mmdrop(ctx->mm_account);
ctx->mm_account = NULL;
}
-#ifdef CONFIG_BLK_CGROUP
- if (ctx->sqo_blkcg_css)
- css_put(ctx->sqo_blkcg_css);
-#endif
-
+ mutex_lock(&ctx->uring_lock);
io_sqe_files_unregister(ctx);
+ mutex_unlock(&ctx->uring_lock);
io_eventfd_unregister(ctx);
io_destroy_buffers(ctx);
idr_destroy(&ctx->personality_idr);
@@ -8760,8 +8432,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
- put_cred(ctx->creds);
- io_req_caches_free(ctx, NULL);
+ io_req_caches_free(ctx);
+ if (ctx->hash_map)
+ io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_hash);
kfree(ctx);
}
@@ -8808,13 +8481,11 @@ static int io_uring_fasync(int fd, struct file *file, int on)
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
- struct io_identity *iod;
+ const struct cred *creds;
- iod = idr_remove(&ctx->personality_idr, id);
- if (iod) {
- put_cred(iod->creds);
- if (refcount_dec_and_test(&iod->count))
- kfree(iod);
+ creds = idr_remove(&ctx->personality_idr, id);
+ if (creds) {
+ put_cred(creds);
return 0;
}
@@ -8829,6 +8500,28 @@ static int io_remove_personalities(int id, void *p, void *data)
return 0;
}
+static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
+{
+ struct callback_head *work, *next;
+ bool executed = false;
+
+ do {
+ work = xchg(&ctx->exit_task_work, NULL);
+ if (!work)
+ break;
+
+ do {
+ next = work->next;
+ work->func(work);
+ work = next;
+ cond_resched();
+ } while (work);
+ executed = true;
+ } while (1);
+
+ return executed;
+}
+
static void io_ring_exit_work(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
@@ -8846,21 +8539,10 @@ static void io_ring_exit_work(struct work_struct *work)
io_ring_ctx_free(ctx);
}
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
- return req->ctx == data;
-}
-
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
percpu_ref_kill(&ctx->refs);
-
- if (WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && !ctx->sqo_dead))
- ctx->sqo_dead = 1;
-
/* if force is set, the ring is going away. always drop after that */
ctx->cq_overflow_flushed = 1;
if (ctx->rings)
@@ -8871,9 +8553,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
io_kill_timeouts(ctx, NULL, NULL);
io_poll_remove_all(ctx, NULL, NULL);
- if (ctx->io_wq)
- io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true);
-
/* if we failed setting up the ctx, we might not have any rings */
io_iopoll_try_reap_events(ctx);
@@ -8952,13 +8631,15 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct files_struct *files)
{
struct io_task_cancel cancel = { .task = task, .files = files, };
+ struct task_struct *tctx_task = task ?: current;
+ struct io_uring_task *tctx = tctx_task->io_uring;
while (1) {
enum io_wq_cancel cret;
bool ret = false;
- if (ctx->io_wq) {
- cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
+ if (tctx && tctx->io_wq) {
+ cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
&cancel, true);
ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
}
@@ -8974,6 +8655,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
ret |= io_poll_remove_all(ctx, task, files);
ret |= io_kill_timeouts(ctx, task, files);
ret |= io_run_task_work();
+ ret |= io_run_ctx_fallback(ctx);
io_cqring_overflow_flush(ctx, true, task, files);
if (!ret)
break;
@@ -9021,17 +8703,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
}
}
-static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
-{
- mutex_lock(&ctx->uring_lock);
- ctx->sqo_dead = 1;
- mutex_unlock(&ctx->uring_lock);
-
- /* make sure callers enter the ring to get error */
- if (ctx->rings)
- io_ring_set_wakeup_flag(ctx);
-}
-
/*
* We need to iteratively cancel requests, in case a request has dependent
* hard links. These persist even for failure of cancelations, hence keep
@@ -9043,10 +8714,15 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
struct task_struct *task = current;
if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
- io_disable_sqo_submit(ctx);
- task = ctx->sq_data->thread;
- atomic_inc(&task->io_uring->in_idle);
+ /* never started, nothing to cancel */
+ if (ctx->flags & IORING_SETUP_R_DISABLED) {
+ io_sq_offload_start(ctx);
+ return;
+ }
io_sq_thread_park(ctx->sq_data);
+ task = ctx->sq_data->thread;
+ if (task)
+ atomic_inc(&task->io_uring->in_idle);
}
io_cancel_defer_files(ctx, task, files);
@@ -9055,10 +8731,10 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
if (!files)
io_uring_try_cancel_requests(ctx, task, NULL);
- if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
+ if (task)
atomic_dec(&task->io_uring->in_idle);
+ if (ctx->sq_data)
io_sq_thread_unpark(ctx->sq_data);
- }
}
/*
@@ -9070,7 +8746,7 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
int ret;
if (unlikely(!tctx)) {
- ret = io_uring_alloc_task_context(current);
+ ret = io_uring_alloc_task_context(current, ctx);
if (unlikely(ret))
return ret;
tctx = current->io_uring;
@@ -9086,10 +8762,6 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
fput(file);
return ret;
}
-
- /* one and only SQPOLL file note, held by sqo_task */
- WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) &&
- current != ctx->sqo_task);
}
tctx->last = file;
}
@@ -9119,13 +8791,17 @@ static void io_uring_del_task_file(struct file *file)
fput(file);
}
-static void io_uring_remove_task_files(struct io_uring_task *tctx)
+static void io_uring_clean_tctx(struct io_uring_task *tctx)
{
struct file *file;
unsigned long index;
xa_for_each(&tctx->xa, index, file)
io_uring_del_task_file(file);
+ if (tctx->io_wq) {
+ io_wq_put_and_exit(tctx->io_wq);
+ tctx->io_wq = NULL;
+ }
}
void __io_uring_files_cancel(struct files_struct *files)
@@ -9141,7 +8817,7 @@ void __io_uring_files_cancel(struct files_struct *files)
atomic_dec(&tctx->in_idle);
if (files)
- io_uring_remove_task_files(tctx);
+ io_uring_clean_tctx(tctx);
}
static s64 tctx_inflight(struct io_uring_task *tctx)
@@ -9151,15 +8827,19 @@ static s64 tctx_inflight(struct io_uring_task *tctx)
static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
{
+ struct io_sq_data *sqd = ctx->sq_data;
struct io_uring_task *tctx;
s64 inflight;
DEFINE_WAIT(wait);
- if (!ctx->sq_data)
+ if (!sqd)
+ return;
+ io_sq_thread_park(sqd);
+ if (!sqd->thread || !sqd->thread->io_uring) {
+ io_sq_thread_unpark(sqd);
return;
+ }
tctx = ctx->sq_data->thread->io_uring;
- io_disable_sqo_submit(ctx);
-
atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */
@@ -9179,6 +8859,7 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
finish_wait(&tctx->wait, &wait);
} while (1);
atomic_dec(&tctx->in_idle);
+ io_sq_thread_unpark(sqd);
}
/*
@@ -9194,7 +8875,6 @@ void __io_uring_task_cancel(void)
/* make sure overflow events are dropped */
atomic_inc(&tctx->in_idle);
- /* trigger io_disable_sqo_submit() */
if (tctx->sqpoll) {
struct file *file;
unsigned long index;
@@ -9224,47 +8904,9 @@ void __io_uring_task_cancel(void)
atomic_dec(&tctx->in_idle);
- io_uring_remove_task_files(tctx);
-}
-
-static int io_uring_flush(struct file *file, void *data)
-{
- struct io_uring_task *tctx = current->io_uring;
- struct io_ring_ctx *ctx = file->private_data;
-
- if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
- io_uring_cancel_task_requests(ctx, NULL);
- io_req_caches_free(ctx, current);
- }
-
- if (!tctx)
- return 0;
-
- /* we should have cancelled and erased it before PF_EXITING */
- WARN_ON_ONCE((current->flags & PF_EXITING) &&
- xa_load(&tctx->xa, (unsigned long)file));
-
- /*
- * fput() is pending, will be 2 if the only other ref is our potential
- * task file note. If the task is exiting, drop regardless of count.
- */
- if (atomic_long_read(&file->f_count) != 2)
- return 0;
-
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- /* there is only one file note, which is owned by sqo_task */
- WARN_ON_ONCE(ctx->sqo_task != current &&
- xa_load(&tctx->xa, (unsigned long)file));
- /* sqo_dead check is for when this happens after cancellation */
- WARN_ON_ONCE(ctx->sqo_task == current && !ctx->sqo_dead &&
- !xa_load(&tctx->xa, (unsigned long)file));
-
- io_disable_sqo_submit(ctx);
- }
-
- if (!(ctx->flags & IORING_SETUP_SQPOLL) || ctx->sqo_task == current)
- io_uring_del_task_file(file);
- return 0;
+ io_uring_clean_tctx(tctx);
+ /* all current's requests should be gone, we can kill tctx */
+ __io_uring_free(current);
}
static void *io_uring_validate_mmap_request(struct file *file,
@@ -9345,22 +8987,14 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
do {
if (!io_sqring_full(ctx))
break;
-
prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
- if (unlikely(ctx->sqo_dead)) {
- ret = -EOWNERDEAD;
- goto out;
- }
-
if (!io_sqring_full(ctx))
break;
-
schedule();
} while (!signal_pending(current));
finish_wait(&ctx->sqo_sq_wait, &wait);
-out:
return ret;
}
@@ -9435,9 +9069,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (ctx->flags & IORING_SETUP_SQPOLL) {
io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ if (unlikely(ctx->sqo_exec)) {
+ ret = io_sq_thread_fork(ctx->sq_data, ctx);
+ if (ret)
+ goto out;
+ ctx->sqo_exec = 0;
+ }
ret = -EOWNERDEAD;
- if (unlikely(ctx->sqo_dead))
- goto out;
if (flags & IORING_ENTER_SQ_WAKEUP)
wake_up(&ctx->sq_data->wait);
if (flags & IORING_ENTER_SQ_WAIT) {
@@ -9491,8 +9129,7 @@ out_fput:
#ifdef CONFIG_PROC_FS
static int io_uring_show_cred(int id, void *p, void *data)
{
- struct io_identity *iod = p;
- const struct cred *cred = iod->creds;
+ const struct cred *cred = p;
struct seq_file *m = data;
struct user_namespace *uns = seq_user_ns(m);
struct group_info *gi;
@@ -9537,8 +9174,11 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
*/
has_lock = mutex_trylock(&ctx->uring_lock);
- if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL))
+ if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
sq = ctx->sq_data;
+ if (!sq->thread)
+ sq = NULL;
+ }
seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
@@ -9590,7 +9230,6 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
static const struct file_operations io_uring_fops = {
.release = io_uring_release,
- .flush = io_uring_flush,
.mmap = io_uring_mmap,
#ifndef CONFIG_MMU
.get_unmapped_area = io_uring_nommu_get_unmapped_area,
@@ -9698,7 +9337,6 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
static int io_uring_create(unsigned entries, struct io_uring_params *p,
struct io_uring_params __user *params)
{
- struct user_struct *user = NULL;
struct io_ring_ctx *ctx;
struct file *file;
int ret;
@@ -9740,22 +9378,12 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
p->cq_entries = 2 * p->sq_entries;
}
- user = get_uid(current_user());
-
ctx = io_ring_ctx_alloc(p);
- if (!ctx) {
- free_uid(user);
+ if (!ctx)
return -ENOMEM;
- }
ctx->compat = in_compat_syscall();
- ctx->limit_mem = !capable(CAP_IPC_LOCK);
- ctx->user = user;
- ctx->creds = get_current_cred();
-#ifdef CONFIG_AUDIT
- ctx->loginuid = current->loginuid;
- ctx->sessionid = current->sessionid;
-#endif
- ctx->sqo_task = get_task_struct(current);
+ if (!capable(CAP_IPC_LOCK))
+ ctx->user = get_uid(current_user());
/*
* This is just grabbed for accounting purposes. When a process exits,
@@ -9766,24 +9394,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
mmgrab(current->mm);
ctx->mm_account = current->mm;
-#ifdef CONFIG_BLK_CGROUP
- /*
- * The sq thread will belong to the original cgroup it was inited in.
- * If the cgroup goes offline (e.g. disabling the io controller), then
- * issued bios will be associated with the closest cgroup later in the
- * block layer.
- */
- rcu_read_lock();
- ctx->sqo_blkcg_css = blkcg_css();
- ret = css_tryget_online(ctx->sqo_blkcg_css);
- rcu_read_unlock();
- if (!ret) {
- /* don't init against a dying cgroup, have the user try again */
- ctx->sqo_blkcg_css = NULL;
- ret = -ENODEV;
- goto err;
- }
-#endif
ret = io_allocate_scq_urings(ctx, p);
if (ret)
goto err;
@@ -9817,7 +9427,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
- IORING_FEAT_EXT_ARG;
+ IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@@ -9836,7 +9446,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
*/
ret = io_uring_install_fd(ctx, file);
if (ret < 0) {
- io_disable_sqo_submit(ctx);
/* fput will clean it up */
fput(file);
return ret;
@@ -9845,7 +9454,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
- io_disable_sqo_submit(ctx);
io_ring_ctx_wait_and_kill(ctx);
return ret;
}
@@ -9923,21 +9531,15 @@ out:
static int io_register_personality(struct io_ring_ctx *ctx)
{
- struct io_identity *id;
+ const struct cred *creds;
int ret;
- id = kmalloc(sizeof(*id), GFP_KERNEL);
- if (unlikely(!id))
- return -ENOMEM;
-
- io_init_identity(id);
- id->creds = get_current_cred();
+ creds = get_current_cred();
- ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL);
- if (ret < 0) {
- put_cred(id->creds);
- kfree(id);
- }
+ ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
+ USHRT_MAX, GFP_KERNEL);
+ if (ret < 0)
+ put_cred(creds);
return ret;
}
@@ -10019,10 +9621,7 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
if (ctx->restrictions.registered)
ctx->restricted = 1;
- ctx->flags &= ~IORING_SETUP_R_DISABLED;
-
io_sq_offload_start(ctx);
-
return 0;
}
@@ -10196,6 +9795,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
ctx = f.file->private_data;
+ io_run_task_work();
+
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 16a1e82e3aeb..7ffcd7ef33d4 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -278,14 +278,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (!is_contig || bio_full(ctx->bio, plen)) {
gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
gfp_t orig_gfp = gfp;
- int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
if (ctx->bio)
submit_bio(ctx->bio);
if (ctx->rac) /* same as readahead_gfp_mask */
gfp |= __GFP_NORETRY | __GFP_NOWARN;
- ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
+ ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs));
/*
* If the bio_alloc fails, try it again for a single page to
* avoid having to deal with partial page reads. This emulates
@@ -1459,13 +1459,6 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
goto redirty;
/*
- * Given that we do not allow direct reclaim to call us, we should
- * never be called in a recursive filesystem reclaim context.
- */
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
- goto redirty;
-
- /*
* Is this page beyond the end of the file?
*
* The page index is less than the end_index, adjust the end_offset
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index 107ee80c3568..dab1b02eba5b 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -10,122 +10,17 @@
#include <linux/pagemap.h>
#include <linux/pagevec.h>
-/*
- * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
- * Returns true if found and updates @lastoff to the offset in file.
- */
-static bool
-page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff,
- int whence)
-{
- const struct address_space_operations *ops = inode->i_mapping->a_ops;
- unsigned int bsize = i_blocksize(inode), off;
- bool seek_data = whence == SEEK_DATA;
- loff_t poff = page_offset(page);
-
- if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE))
- return false;
-
- if (*lastoff < poff) {
- /*
- * Last offset smaller than the start of the page means we found
- * a hole:
- */
- if (whence == SEEK_HOLE)
- return true;
- *lastoff = poff;
- }
-
- /*
- * Just check the page unless we can and should check block ranges:
- */
- if (bsize == PAGE_SIZE || !ops->is_partially_uptodate)
- return PageUptodate(page) == seek_data;
-
- lock_page(page);
- if (unlikely(page->mapping != inode->i_mapping))
- goto out_unlock_not_found;
-
- for (off = 0; off < PAGE_SIZE; off += bsize) {
- if (offset_in_page(*lastoff) >= off + bsize)
- continue;
- if (ops->is_partially_uptodate(page, off, bsize) == seek_data) {
- unlock_page(page);
- return true;
- }
- *lastoff = poff + off + bsize;
- }
-
-out_unlock_not_found:
- unlock_page(page);
- return false;
-}
-
-/*
- * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
- *
- * Within unwritten extents, the page cache determines which parts are holes
- * and which are data: uptodate buffer heads count as data; everything else
- * counts as a hole.
- *
- * Returns the resulting offset on successs, and -ENOENT otherwise.
- */
static loff_t
-page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
- int whence)
-{
- pgoff_t index = offset >> PAGE_SHIFT;
- pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
- loff_t lastoff = offset;
- struct pagevec pvec;
-
- if (length <= 0)
- return -ENOENT;
-
- pagevec_init(&pvec);
-
- do {
- unsigned nr_pages, i;
-
- nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
- end - 1);
- if (nr_pages == 0)
- break;
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
-
- if (page_seek_hole_data(inode, page, &lastoff, whence))
- goto check_range;
- lastoff = page_offset(page) + PAGE_SIZE;
- }
- pagevec_release(&pvec);
- } while (index < end);
-
- /* When no page at lastoff and we are not done, we found a hole. */
- if (whence != SEEK_HOLE)
- goto not_found;
-
-check_range:
- if (lastoff < offset + length)
- goto out;
-not_found:
- lastoff = -ENOENT;
-out:
- pagevec_release(&pvec);
- return lastoff;
-}
-
-
-static loff_t
-iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
+iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length,
void *data, struct iomap *iomap, struct iomap *srcmap)
{
+ loff_t offset = start;
+
switch (iomap->type) {
case IOMAP_UNWRITTEN:
- offset = page_cache_seek_hole_data(inode, offset, length,
- SEEK_HOLE);
- if (offset < 0)
+ offset = mapping_seek_hole_data(inode->i_mapping, start,
+ start + length, SEEK_HOLE);
+ if (offset == start + length)
return length;
fallthrough;
case IOMAP_HOLE:
@@ -164,15 +59,17 @@ iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
EXPORT_SYMBOL_GPL(iomap_seek_hole);
static loff_t
-iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
+iomap_seek_data_actor(struct inode *inode, loff_t start, loff_t length,
void *data, struct iomap *iomap, struct iomap *srcmap)
{
+ loff_t offset = start;
+
switch (iomap->type) {
case IOMAP_HOLE:
return length;
case IOMAP_UNWRITTEN:
- offset = page_cache_seek_hole_data(inode, offset, length,
- SEEK_DATA);
+ offset = mapping_seek_hole_data(inode->i_mapping, start,
+ start + length, SEEK_DATA);
if (offset < 0)
return length;
fallthrough;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b2dc4d1f9dcc..1f0ffabbde56 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -551,7 +551,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
ret = -ENOMEM;
goto out_unload;
}
- inode->i_ino = 0;
inode->i_size = i_size_read(sb->s_bdev->bd_inode);
inode->i_mapping->a_ops = &jfs_metapage_aops;
inode_fake_hash(inode);
diff --git a/fs/mpage.c b/fs/mpage.c
index 830e6cc2a9e7..961234d68779 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -304,9 +304,7 @@ alloc_new:
goto out;
}
args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- min_t(int, args->nr_pages,
- BIO_MAX_PAGES),
- gfp);
+ bio_max_segs(args->nr_pages), gfp);
if (args->bio == NULL)
goto confused;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 062fe984a697..56bb5a5fdc0d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -381,50 +381,36 @@ int mnt_want_write(struct vfsmount *m)
EXPORT_SYMBOL_GPL(mnt_want_write);
/**
- * mnt_clone_write - get write access to a mount
- * @mnt: the mount on which to take a write
- *
- * This is effectively like mnt_want_write, except
- * it must only be used to take an extra write reference
- * on a mountpoint that we already know has a write reference
- * on it. This allows some optimisation.
- *
- * After finished, mnt_drop_write must be called as usual to
- * drop the reference.
- */
-int mnt_clone_write(struct vfsmount *mnt)
-{
- /* superblock may be r/o */
- if (__mnt_is_readonly(mnt))
- return -EROFS;
- preempt_disable();
- mnt_inc_writers(real_mount(mnt));
- preempt_enable();
- return 0;
-}
-EXPORT_SYMBOL_GPL(mnt_clone_write);
-
-/**
* __mnt_want_write_file - get write access to a file's mount
* @file: the file who's mount on which to take a write
*
- * This is like __mnt_want_write, but it takes a file and can
- * do some optimisations if the file is open for write already
+ * This is like __mnt_want_write, but if the file is already open for writing it
+ * skips incrementing mnt_writers (since the open file already has a reference)
+ * and instead only does the check for emergency r/o remounts. This must be
+ * paired with __mnt_drop_write_file.
*/
int __mnt_want_write_file(struct file *file)
{
- if (!(file->f_mode & FMODE_WRITER))
- return __mnt_want_write(file->f_path.mnt);
- else
- return mnt_clone_write(file->f_path.mnt);
+ if (file->f_mode & FMODE_WRITER) {
+ /*
+ * Superblock may have become readonly while there are still
+ * writable fd's, e.g. due to a fs error with errors=remount-ro
+ */
+ if (__mnt_is_readonly(file->f_path.mnt))
+ return -EROFS;
+ return 0;
+ }
+ return __mnt_want_write(file->f_path.mnt);
}
/**
* mnt_want_write_file - get write access to a file's mount
* @file: the file who's mount on which to take a write
*
- * This is like mnt_want_write, but it takes a file and can
- * do some optimisations if the file is open for write already
+ * This is like mnt_want_write, but if the file is already open for writing it
+ * skips incrementing mnt_writers (since the open file already has a reference)
+ * and instead only does the freeze protection and the check for emergency r/o
+ * remounts. This must be paired with mnt_drop_write_file.
*/
int mnt_want_write_file(struct file *file)
{
@@ -470,7 +456,8 @@ EXPORT_SYMBOL_GPL(mnt_drop_write);
void __mnt_drop_write_file(struct file *file)
{
- __mnt_drop_write(file->f_path.mnt);
+ if (!(file->f_mode & FMODE_WRITER))
+ __mnt_drop_write(file->f_path.mnt);
}
void mnt_drop_write_file(struct file *file)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 1a96ce28efb0..fe860c538747 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -115,13 +115,13 @@ bl_submit_bio(struct bio *bio)
return NULL;
}
-static struct bio *
-bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
+static struct bio *bl_alloc_init_bio(unsigned int npg,
+ struct block_device *bdev, sector_t disk_sector,
bio_end_io_t end_io, struct parallel_io *par)
{
struct bio *bio;
- npg = min(npg, BIO_MAX_PAGES);
+ npg = bio_max_segs(npg);
bio = bio_alloc(GFP_NOIO, npg);
if (bio) {
bio->bi_iter.bi_sector = disk_sector;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 63940a7a70be..16ad5050e046 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -89,7 +89,7 @@ nfs_file_release(struct inode *inode, struct file *filp)
EXPORT_SYMBOL_GPL(nfs_file_release);
/**
- * nfs_revalidate_size - Revalidate the file size
+ * nfs_revalidate_file_size - Revalidate the file size
* @inode: pointer to inode struct
* @filp: pointer to struct file
*
@@ -606,8 +606,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- unsigned long written = 0;
- ssize_t result;
+ unsigned int mntflags = NFS_SERVER(inode)->flags;
+ ssize_t result, written;
errseq_t since;
int error;
@@ -626,13 +626,13 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
/*
* O_APPEND implies that we must revalidate the file length.
*/
- if (iocb->ki_flags & IOCB_APPEND) {
+ if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) {
result = nfs_revalidate_file_size(inode, file);
if (result)
goto out;
}
- if (iocb->ki_pos > i_size_read(inode))
- nfs_revalidate_mapping(inode, file->f_mapping);
+
+ nfs_clear_invalid_mapping(file->f_mapping);
since = filemap_sample_wb_err(file->f_mapping);
nfs_start_io_write(inode);
@@ -648,6 +648,21 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
written = result;
iocb->ki_pos += written;
+
+ if (mntflags & NFS_MOUNT_WRITE_EAGER) {
+ result = filemap_fdatawrite_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
+ if (result < 0)
+ goto out;
+ }
+ if (mntflags & NFS_MOUNT_WRITE_WAIT) {
+ result = filemap_fdatawait_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
+ if (result < 0)
+ goto out;
+ }
result = generic_write_sync(iocb, written);
if (result < 0)
goto out;
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 06894bcdea2d..971a9251c1d9 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -82,6 +82,7 @@ enum nfs_param {
Opt_v,
Opt_vers,
Opt_wsize,
+ Opt_write,
};
enum {
@@ -113,6 +114,19 @@ static const struct constant_table nfs_param_enums_lookupcache[] = {
{}
};
+enum {
+ Opt_write_lazy,
+ Opt_write_eager,
+ Opt_write_wait,
+};
+
+static const struct constant_table nfs_param_enums_write[] = {
+ { "lazy", Opt_write_lazy },
+ { "eager", Opt_write_eager },
+ { "wait", Opt_write_wait },
+ {}
+};
+
static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_flag_no("ac", Opt_ac),
fsparam_u32 ("acdirmax", Opt_acdirmax),
@@ -171,6 +185,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_flag ("v4.1", Opt_v),
fsparam_flag ("v4.2", Opt_v),
fsparam_string("vers", Opt_vers),
+ fsparam_enum ("write", Opt_write, nfs_param_enums_write),
fsparam_u32 ("wsize", Opt_wsize),
{}
};
@@ -770,6 +785,24 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
goto out_invalid_value;
}
break;
+ case Opt_write:
+ switch (result.uint_32) {
+ case Opt_write_lazy:
+ ctx->flags &=
+ ~(NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT);
+ break;
+ case Opt_write_eager:
+ ctx->flags |= NFS_MOUNT_WRITE_EAGER;
+ ctx->flags &= ~NFS_MOUNT_WRITE_WAIT;
+ break;
+ case Opt_write_wait:
+ ctx->flags |=
+ NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT;
+ break;
+ default:
+ goto out_invalid_value;
+ }
+ break;
/*
* Special options
@@ -1479,6 +1512,8 @@ static int nfs_init_fs_context(struct fs_context *fc)
ctx->selected_flavor = RPC_AUTH_MAXFLAVOR;
ctx->minorversion = 0;
ctx->need_mount = true;
+
+ fc->s_iflags |= SB_I_STABLE_WRITES;
}
fc->fs_private = ctx;
fc->ops = &nfs_fs_context_ops;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index a60df88efc40..c4c021c6ebbd 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -390,10 +390,6 @@ static void nfs_readpage_from_fscache_complete(struct page *page,
if (!error) {
SetPageUptodate(page);
unlock_page(page);
- } else {
- error = nfs_readpage_async(context, page->mapping->host, page);
- if (error)
- unlock_page(page);
}
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 447e95974386..749bbea14d99 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -195,6 +195,18 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags)
}
EXPORT_SYMBOL_GPL(nfs_check_cache_invalid);
+#ifdef CONFIG_NFS_V4_2
+static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi)
+{
+ return nfsi->xattr_cache != NULL;
+}
+#else
+static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi)
+{
+ return false;
+}
+#endif
+
static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -209,6 +221,8 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
| NFS_INO_INVALID_XATTR);
}
+ if (!nfs_has_xattr_cache(nfsi))
+ flags &= ~NFS_INO_INVALID_XATTR;
if (inode->i_mapping->nrpages == 0)
flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER);
nfsi->cache_validity |= flags;
@@ -1258,55 +1272,19 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
return 0;
}
-bool nfs_mapping_need_revalidate_inode(struct inode *inode)
-{
- return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) ||
- NFS_STALE(inode);
-}
-
-int nfs_revalidate_mapping_rcu(struct inode *inode)
-{
- struct nfs_inode *nfsi = NFS_I(inode);
- unsigned long *bitlock = &nfsi->flags;
- int ret = 0;
-
- if (IS_SWAPFILE(inode))
- goto out;
- if (nfs_mapping_need_revalidate_inode(inode)) {
- ret = -ECHILD;
- goto out;
- }
- spin_lock(&inode->i_lock);
- if (test_bit(NFS_INO_INVALIDATING, bitlock) ||
- (nfsi->cache_validity & NFS_INO_INVALID_DATA))
- ret = -ECHILD;
- spin_unlock(&inode->i_lock);
-out:
- return ret;
-}
-
/**
- * nfs_revalidate_mapping - Revalidate the pagecache
- * @inode: pointer to host inode
+ * nfs_clear_invalid_mapping - Conditionally clear a mapping
* @mapping: pointer to mapping
+ *
+ * If the NFS_INO_INVALID_DATA inode flag is set, clear the mapping.
*/
-int nfs_revalidate_mapping(struct inode *inode,
- struct address_space *mapping)
+int nfs_clear_invalid_mapping(struct address_space *mapping)
{
+ struct inode *inode = mapping->host;
struct nfs_inode *nfsi = NFS_I(inode);
unsigned long *bitlock = &nfsi->flags;
int ret = 0;
- /* swapfiles are not supposed to be shared. */
- if (IS_SWAPFILE(inode))
- goto out;
-
- if (nfs_mapping_need_revalidate_inode(inode)) {
- ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
- if (ret < 0)
- goto out;
- }
-
/*
* We must clear NFS_INO_INVALID_DATA first to ensure that
* invalidations that come in while we're shooting down the mappings
@@ -1337,8 +1315,8 @@ int nfs_revalidate_mapping(struct inode *inode,
set_bit(NFS_INO_INVALIDATING, bitlock);
smp_wmb();
- nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA|
- NFS_INO_DATA_INVAL_DEFER);
+ nfsi->cache_validity &=
+ ~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER);
spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
ret = nfs_invalidate_mapping(inode, mapping);
@@ -1351,6 +1329,53 @@ out:
return ret;
}
+bool nfs_mapping_need_revalidate_inode(struct inode *inode)
+{
+ return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) ||
+ NFS_STALE(inode);
+}
+
+int nfs_revalidate_mapping_rcu(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long *bitlock = &nfsi->flags;
+ int ret = 0;
+
+ if (IS_SWAPFILE(inode))
+ goto out;
+ if (nfs_mapping_need_revalidate_inode(inode)) {
+ ret = -ECHILD;
+ goto out;
+ }
+ spin_lock(&inode->i_lock);
+ if (test_bit(NFS_INO_INVALIDATING, bitlock) ||
+ (nfsi->cache_validity & NFS_INO_INVALID_DATA))
+ ret = -ECHILD;
+ spin_unlock(&inode->i_lock);
+out:
+ return ret;
+}
+
+/**
+ * nfs_revalidate_mapping - Revalidate the pagecache
+ * @inode: pointer to host inode
+ * @mapping: pointer to mapping
+ */
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+ /* swapfiles are not supposed to be shared. */
+ if (IS_SWAPFILE(inode))
+ return 0;
+
+ if (nfs_mapping_need_revalidate_inode(inode)) {
+ int ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (ret < 0)
+ return ret;
+ }
+
+ return nfs_clear_invalid_mapping(mapping);
+}
+
static bool nfs_file_has_writers(struct nfs_inode *nfsi)
{
struct inode *inode = &nfsi->vfs_inode;
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 5604e807fc01..bb386a691e69 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -111,6 +111,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
fallthrough;
case -ENOTSUPP:
status = -EOPNOTSUPP;
+ goto getout;
default:
goto getout;
}
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 86acffe7335c..889a9f4c0310 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -609,6 +609,7 @@ found:
* changed. Schedule recovery!
*/
nfs4_schedule_path_down_recovery(pos);
+ goto out;
default:
goto out;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a07530cf673d..74bc5120013d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -71,10 +71,6 @@
#include "nfs4trace.h"
-#ifdef CONFIG_NFS_V4_2
-#include "nfs42.h"
-#endif /* CONFIG_NFS_V4_2 */
-
#define NFSDBG_FACILITY NFSDBG_PROC
#define NFS4_BITMASK_SZ 3
@@ -2231,6 +2227,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
default:
printk(KERN_ERR "NFS: %s: unhandled error "
"%d.\n", __func__, err);
+ fallthrough;
case 0:
case -ENOENT:
case -EAGAIN:
@@ -5438,15 +5435,16 @@ static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode,
if (cache_validity & NFS_INO_INVALID_ATIME)
bitmask[1] |= FATTR4_WORD1_TIME_ACCESS;
- if (cache_validity & NFS_INO_INVALID_ACCESS)
- bitmask[0] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER |
- FATTR4_WORD1_OWNER_GROUP;
- if (cache_validity & NFS_INO_INVALID_ACL)
- bitmask[0] |= FATTR4_WORD0_ACL;
- if (cache_validity & NFS_INO_INVALID_LABEL)
+ if (cache_validity & NFS_INO_INVALID_OTHER)
+ bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER |
+ FATTR4_WORD1_OWNER_GROUP |
+ FATTR4_WORD1_NUMLINKS;
+ if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL)
bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL;
- if (cache_validity & NFS_INO_INVALID_CTIME)
+ if (cache_validity & NFS_INO_INVALID_CHANGE)
bitmask[0] |= FATTR4_WORD0_CHANGE;
+ if (cache_validity & NFS_INO_INVALID_CTIME)
+ bitmask[1] |= FATTR4_WORD1_TIME_METADATA;
if (cache_validity & NFS_INO_INVALID_MTIME)
bitmask[1] |= FATTR4_WORD1_TIME_MODIFY;
if (cache_validity & NFS_INO_INVALID_SIZE)
@@ -9708,6 +9706,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
case -NFS4ERR_BADLAYOUT: /* no layout */
case -NFS4ERR_GRACE: /* loca_recalim always false */
task->tk_status = 0;
+ break;
case 0:
break;
default:
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 4bf10792cb5b..3a51351bdc6a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1125,6 +1125,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
" sequence-id error on an"
" unconfirmed sequence %p!\n",
seqid->sequence);
+ return;
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_BAD_STATEID:
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index af64b4e6fd1f..102b66e0bdef 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2875,6 +2875,7 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc,
switch (trypnfs) {
case PNFS_NOT_ATTEMPTED:
pnfs_write_through_mds(desc, hdr);
+ break;
case PNFS_ATTEMPTED:
break;
case PNFS_TRY_AGAIN:
@@ -3019,6 +3020,7 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
switch (trypnfs) {
case PNFS_NOT_ATTEMPTED:
pnfs_read_through_mds(desc, hdr);
+ break;
case PNFS_ATTEMPTED:
break;
case PNFS_TRY_AGAIN:
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index eb854f1f86e2..d2b6dce1f99f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -74,6 +74,24 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
+static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode)
+{
+ struct nfs_pgio_mirror *pgm;
+ unsigned long npages;
+
+ nfs_pageio_complete(pgio);
+
+ /* It doesn't make sense to do mirrored reads! */
+ WARN_ON_ONCE(pgio->pg_mirror_count != 1);
+
+ pgm = &pgio->pg_mirrors[0];
+ NFS_I(inode)->read_io += pgm->pg_bytes_written;
+ npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ nfs_add_stats(inode, NFSIOS_READPAGES, npages);
+}
+
+
void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
{
struct nfs_pgio_mirror *mirror;
@@ -114,41 +132,10 @@ static void nfs_readpage_release(struct nfs_page *req, int error)
nfs_release_request(req);
}
-int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
- struct page *page)
-{
- struct nfs_page *new;
- unsigned int len;
+struct nfs_readdesc {
struct nfs_pageio_descriptor pgio;
- struct nfs_pgio_mirror *pgm;
-
- len = nfs_page_length(page);
- if (len == 0)
- return nfs_return_empty_page(page);
- new = nfs_create_request(ctx, page, 0, len);
- if (IS_ERR(new)) {
- unlock_page(page);
- return PTR_ERR(new);
- }
- if (len < PAGE_SIZE)
- zero_user_segment(page, len, PAGE_SIZE);
-
- nfs_pageio_init_read(&pgio, inode, false,
- &nfs_async_read_completion_ops);
- if (!nfs_pageio_add_request(&pgio, new)) {
- nfs_list_remove_request(new);
- nfs_readpage_release(new, pgio.pg_error);
- }
- nfs_pageio_complete(&pgio);
-
- /* It doesn't make sense to do mirrored reads! */
- WARN_ON_ONCE(pgio.pg_mirror_count != 1);
-
- pgm = &pgio.pg_mirrors[0];
- NFS_I(inode)->read_io += pgm->pg_bytes_written;
-
- return pgio.pg_error < 0 ? pgio.pg_error : 0;
-}
+ struct nfs_open_context *ctx;
+};
static void nfs_page_group_set_uptodate(struct nfs_page *req)
{
@@ -171,8 +158,7 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
/* note: regions of the page not covered by a
- * request are zeroed in nfs_readpage_async /
- * readpage_async_filler */
+ * request are zeroed in readpage_async_filler */
if (bytes > hdr->good_bytes) {
/* nothing in this request was good, so zero
* the full extent of the request */
@@ -304,6 +290,38 @@ static void nfs_readpage_result(struct rpc_task *task,
nfs_readpage_retry(task, hdr);
}
+static int
+readpage_async_filler(void *data, struct page *page)
+{
+ struct nfs_readdesc *desc = data;
+ struct nfs_page *new;
+ unsigned int len;
+ int error;
+
+ len = nfs_page_length(page);
+ if (len == 0)
+ return nfs_return_empty_page(page);
+
+ new = nfs_create_request(desc->ctx, page, 0, len);
+ if (IS_ERR(new))
+ goto out_error;
+
+ if (len < PAGE_SIZE)
+ zero_user_segment(page, len, PAGE_SIZE);
+ if (!nfs_pageio_add_request(&desc->pgio, new)) {
+ nfs_list_remove_request(new);
+ error = desc->pgio.pg_error;
+ nfs_readpage_release(new, error);
+ goto out;
+ }
+ return 0;
+out_error:
+ error = PTR_ERR(new);
+ unlock_page(page);
+out:
+ return error;
+}
+
/*
* Read a page over NFS.
* We read the page synchronously in the following case:
@@ -312,14 +330,13 @@ static void nfs_readpage_result(struct rpc_task *task,
*/
int nfs_readpage(struct file *file, struct page *page)
{
- struct nfs_open_context *ctx;
+ struct nfs_readdesc desc;
struct inode *inode = page_file_mapping(page)->host;
- int error;
+ int ret;
dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
page, PAGE_SIZE, page_index(page));
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
- nfs_add_stats(inode, NFSIOS_READPAGES, 1);
/*
* Try to flush any pending writes to the file..
@@ -328,93 +345,59 @@ int nfs_readpage(struct file *file, struct page *page)
* be any new pending writes generated at this point
* for this page (other pages can be written to).
*/
- error = nfs_wb_page(inode, page);
- if (error)
+ ret = nfs_wb_page(inode, page);
+ if (ret)
goto out_unlock;
if (PageUptodate(page))
goto out_unlock;
- error = -ESTALE;
+ ret = -ESTALE;
if (NFS_STALE(inode))
goto out_unlock;
if (file == NULL) {
- error = -EBADF;
- ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
- if (ctx == NULL)
+ ret = -EBADF;
+ desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
+ if (desc.ctx == NULL)
goto out_unlock;
} else
- ctx = get_nfs_open_context(nfs_file_open_context(file));
+ desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
if (!IS_SYNC(inode)) {
- error = nfs_readpage_from_fscache(ctx, inode, page);
- if (error == 0)
+ ret = nfs_readpage_from_fscache(desc.ctx, inode, page);
+ if (ret == 0)
goto out;
}
- xchg(&ctx->error, 0);
- error = nfs_readpage_async(ctx, inode, page);
- if (!error) {
- error = wait_on_page_locked_killable(page);
- if (!PageUptodate(page) && !error)
- error = xchg(&ctx->error, 0);
- }
-out:
- put_nfs_open_context(ctx);
- return error;
-out_unlock:
- unlock_page(page);
- return error;
-}
-
-struct nfs_readdesc {
- struct nfs_pageio_descriptor *pgio;
- struct nfs_open_context *ctx;
-};
-
-static int
-readpage_async_filler(void *data, struct page *page)
-{
- struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
- struct nfs_page *new;
- unsigned int len;
- int error;
+ xchg(&desc.ctx->error, 0);
+ nfs_pageio_init_read(&desc.pgio, inode, false,
+ &nfs_async_read_completion_ops);
- len = nfs_page_length(page);
- if (len == 0)
- return nfs_return_empty_page(page);
+ ret = readpage_async_filler(&desc, page);
- new = nfs_create_request(desc->ctx, page, 0, len);
- if (IS_ERR(new))
- goto out_error;
+ if (!ret)
+ nfs_pageio_complete_read(&desc.pgio, inode);
- if (len < PAGE_SIZE)
- zero_user_segment(page, len, PAGE_SIZE);
- if (!nfs_pageio_add_request(desc->pgio, new)) {
- nfs_list_remove_request(new);
- error = desc->pgio->pg_error;
- nfs_readpage_release(new, error);
- goto out;
+ ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0;
+ if (!ret) {
+ ret = wait_on_page_locked_killable(page);
+ if (!PageUptodate(page) && !ret)
+ ret = xchg(&desc.ctx->error, 0);
}
- return 0;
-out_error:
- error = PTR_ERR(new);
- unlock_page(page);
out:
- return error;
+ put_nfs_open_context(desc.ctx);
+ return ret;
+out_unlock:
+ unlock_page(page);
+ return ret;
}
-int nfs_readpages(struct file *filp, struct address_space *mapping,
+int nfs_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
- struct nfs_pageio_descriptor pgio;
- struct nfs_pgio_mirror *pgm;
- struct nfs_readdesc desc = {
- .pgio = &pgio,
- };
+ struct nfs_readdesc desc;
struct inode *inode = mapping->host;
- unsigned long npages;
- int ret = -ESTALE;
+ int ret;
dprintk("NFS: nfs_readpages (%s/%Lu %d)\n",
inode->i_sb->s_id,
@@ -422,15 +405,17 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
nr_pages);
nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);
+ ret = -ESTALE;
if (NFS_STALE(inode))
goto out;
- if (filp == NULL) {
+ if (file == NULL) {
+ ret = -EBADF;
desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
if (desc.ctx == NULL)
- return -EBADF;
+ goto out;
} else
- desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
+ desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
/* attempt to read as many of the pages as possible from the cache
* - this returns -ENOBUFS immediately if the cookie is negative
@@ -440,20 +425,13 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
if (ret == 0)
goto read_complete; /* all pages were read */
- nfs_pageio_init_read(&pgio, inode, false,
+ nfs_pageio_init_read(&desc.pgio, inode, false,
&nfs_async_read_completion_ops);
ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
- nfs_pageio_complete(&pgio);
- /* It doesn't make sense to do mirrored reads! */
- WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+ nfs_pageio_complete_read(&desc.pgio, inode);
- pgm = &pgio.pg_mirrors[0];
- NFS_I(inode)->read_io += pgm->pg_bytes_written;
- npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- nfs_add_stats(inode, NFSIOS_READPAGES, npages);
read_complete:
put_nfs_open_context(desc.ctx);
out:
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index c7a924580eec..94885c6f8f54 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -523,6 +523,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
seq_puts(m, ",local_lock=flock");
else
seq_puts(m, ",local_lock=posix");
+
+ if (nfss->flags & NFS_MOUNT_WRITE_EAGER) {
+ if (nfss->flags & NFS_MOUNT_WRITE_WAIT)
+ seq_puts(m, ",write=wait");
+ else
+ seq_puts(m, ",write=eager");
+ }
}
/*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 639c34fec04a..82bdcb982186 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -712,16 +712,23 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
struct nfs_pageio_descriptor pgio;
- struct nfs_io_completion *ioc;
+ struct nfs_io_completion *ioc = NULL;
+ unsigned int mntflags = NFS_SERVER(inode)->flags;
+ int priority = 0;
int err;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
- ioc = nfs_io_completion_alloc(GFP_KERNEL);
- if (ioc)
- nfs_io_completion_init(ioc, nfs_io_completion_commit, inode);
+ if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate ||
+ wbc->for_background || wbc->for_sync || wbc->for_reclaim) {
+ ioc = nfs_io_completion_alloc(GFP_KERNEL);
+ if (ioc)
+ nfs_io_completion_init(ioc, nfs_io_completion_commit,
+ inode);
+ priority = wb_priority(wbc);
+ }
- nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
+ nfs_pageio_init_write(&pgio, inode, priority, false,
&nfs_async_write_completion_ops);
pgio.pg_io_completion = ioc;
err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
@@ -1278,19 +1285,21 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
* the PageUptodate() flag. In this case, we will need to turn off
* write optimisations that depend on the page contents being correct.
*/
-static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
+static bool nfs_write_pageuptodate(struct page *page, struct inode *inode,
+ unsigned int pagelen)
{
struct nfs_inode *nfsi = NFS_I(inode);
if (nfs_have_delegated_attributes(inode))
goto out;
- if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
+ if (nfsi->cache_validity &
+ (NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_SIZE))
return false;
smp_rmb();
- if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
+ if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags) && pagelen != 0)
return false;
out:
- if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA && pagelen != 0)
return false;
return PageUptodate(page) != 0;
}
@@ -1310,7 +1319,8 @@ is_whole_file_wrlock(struct file_lock *fl)
* If the file is opened for synchronous writes then we can just skip the rest
* of the checks.
*/
-static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
+static int nfs_can_extend_write(struct file *file, struct page *page,
+ struct inode *inode, unsigned int pagelen)
{
int ret;
struct file_lock_context *flctx = inode->i_flctx;
@@ -1318,7 +1328,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino
if (file->f_flags & O_DSYNC)
return 0;
- if (!nfs_write_pageuptodate(page, inode))
+ if (!nfs_write_pageuptodate(page, inode, pagelen))
return 0;
if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
return 1;
@@ -1356,6 +1366,7 @@ int nfs_updatepage(struct file *file, struct page *page,
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct address_space *mapping = page_file_mapping(page);
struct inode *inode = mapping->host;
+ unsigned int pagelen = nfs_page_length(page);
int status = 0;
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
@@ -1366,8 +1377,8 @@ int nfs_updatepage(struct file *file, struct page *page,
if (!count)
goto out;
- if (nfs_can_extend_write(file, page, inode)) {
- count = max(count + offset, nfs_page_length(page));
+ if (nfs_can_extend_write(file, page, inode, pagelen)) {
+ count = max(count + offset, pagelen);
offset = 0;
}
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 4435dbbc0b63..f5c058b3192c 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -629,6 +629,12 @@ static int ntfs_read_locked_inode(struct inode *vi)
}
a = ctx->attr;
/* Get the standard information attribute value. */
+ if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset)
+ + le32_to_cpu(a->data.resident.value_length) >
+ (u8 *)ctx->mrec + vol->mft_record_size) {
+ ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode.");
+ goto unm_err_out;
+ }
si = (STANDARD_INFORMATION*)((u8*)a +
le16_to_cpu(a->data.resident.value_offset));
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 85422761ff43..5d4bf7a3259f 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -703,7 +703,7 @@ typedef struct {
/* 14*/ le16 instance; /* The instance of this attribute record. This
number is unique within this mft record (see
MFT_RECORD/next_attribute_instance notes in
- in mft.h for more details). */
+ mft.h for more details). */
/* 16*/ union {
/* Resident attributes. */
struct {
@@ -1838,7 +1838,7 @@ typedef struct {
* Also, each security descriptor is stored twice in the $SDS stream with a
* fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size)
* between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the
- * the first copy of the security descriptor will be at offset 0x51d0 in the
+ * first copy of the security descriptor will be at offset 0x51d0 in the
* $SDS data stream and the second copy will be at offset 0x451d0.
*/
typedef struct {
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 0179a73a3fa2..12a7590601dd 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2042,7 +2042,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
o2hb_nego_timeout_handler,
reg, NULL, &reg->hr_handler_list);
if (ret)
- goto free;
+ goto remove_item;
ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key,
sizeof(struct o2hb_nego_msg),
@@ -2057,6 +2057,12 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
unregister_handler:
o2net_unregister_handler_list(&reg->hr_handler_list);
+remove_item:
+ spin_lock(&o2hb_live_lock);
+ list_del(&reg->hr_all_item);
+ if (o2hb_global_heartbeat_active())
+ clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
free:
kfree(reg);
return ERR_PTR(ret);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 6abaded3ff6b..70a10764f249 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -165,16 +165,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
spin_unlock(&lock->spinlock);
}
-void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
-{
- BUG_ON(!dlm);
- BUG_ON(!lock);
-
- spin_lock(&dlm->ast_lock);
- __dlm_queue_bast(dlm, lock);
- spin_unlock(&dlm->ast_lock);
-}
-
static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index c8a444622faa..58d57e25d384 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -17,10 +17,7 @@
#define DLM_LOCKID_NAME_MAX 32
-#define DLM_DOMAIN_NAME_MAX_LEN 255
#define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES
-#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
-#define DLM_THREAD_MS 200 // flush at least every 200 ms
#define DLM_HASH_SIZE_DEFAULT (1 << 17)
#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
@@ -902,7 +899,6 @@ void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
-void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_do_local_ast(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index c26937824be1..c19a463fac55 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -978,7 +978,7 @@ static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
return 0;
}
- if (!eb || (eb && !eb->h_next_leaf_blk)) {
+ if (!eb || !eb->h_next_leaf_blk) {
/*
* We are the last extent rec, so any high cpos should
* be stored in this leaf refcount block.
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2febc76e9de7..079f8826993e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -973,8 +973,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
* quota files */
dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
DQUOT_LIMITS_ENABLED);
- if (!inode)
- continue;
iput(inode);
}
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 39c96845a72f..bfd946a9ad01 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -171,7 +171,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal);
*
* Description:
* This function grabs an extra reference to @buf. It's used in
- * in the tee() system call, when we duplicate the buffers in one
+ * the tee() system call, when we duplicate the buffers in one
* pipe into another.
*/
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 56bf14316122..3851bfcdba56 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -67,7 +67,6 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/rcupdate.h>
-#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
@@ -386,19 +385,17 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
unsigned long wchan;
- char symname[KSYM_NAME_LEN];
- if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
- goto print0;
+ if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ wchan = get_wchan(task);
+ else
+ wchan = 0;
- wchan = get_wchan(task);
- if (wchan && !lookup_symbol_name(wchan, symname)) {
- seq_puts(m, symname);
- return 0;
- }
+ if (wchan)
+ seq_printf(m, "%ps", (void *) wchan);
+ else
+ seq_putc(m, '0');
-print0:
- seq_putc(m, '0');
return 0;
}
#endif /* CONFIG_KALLSYMS */
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d6fc74619625..6fa761c9cc78 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -129,15 +129,15 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
show_val_kb(m, "AnonHugePages: ",
- global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);
+ global_node_page_state(NR_ANON_THPS));
show_val_kb(m, "ShmemHugePages: ",
- global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
+ global_node_page_state(NR_SHMEM_THPS));
show_val_kb(m, "ShmemPmdMapped: ",
- global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
+ global_node_page_state(NR_SHMEM_PMDMAPPED));
show_val_kb(m, "FileHugePages: ",
- global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
+ global_node_page_state(NR_FILE_THPS));
show_val_kb(m, "FilePmdMapped: ",
- global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
+ global_node_page_state(NR_FILE_PMDMAPPED));
#endif
#ifdef CONFIG_CMA
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 656ba24c317d..984e42f8cb11 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -571,7 +571,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
error = -ENOMEM;
if (count >= KMALLOC_MAX_SIZE)
goto out;
- kbuf = kzalloc(count + 1, GFP_KERNEL);
+ kbuf = kvzalloc(count + 1, GFP_KERNEL);
if (!kbuf)
goto out;
@@ -600,7 +600,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
error = count;
out_free_buf:
- kfree(kbuf);
+ kvfree(kbuf);
out:
sysctl_head_finish(head);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index a4012154e109..72cd69bcaf4a 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -16,13 +16,6 @@ static const char *proc_self_get_link(struct dentry *dentry,
pid_t tgid = task_tgid_nr_ns(current, ns);
char *name;
- /*
- * Not currently supported. Once we can inherit all of struct pid,
- * we can allow this.
- */
- if (current->flags & PF_IO_WORKER)
- return ERR_PTR(-EOPNOTSUPP);
-
if (!tgid)
return ERR_PTR(-ENOENT);
/* max length of unsigned int in decimal + NULL term */
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index d56681d86d28..a553273fbd41 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -17,13 +17,6 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
pid_t pid = task_pid_nr_ns(current, ns);
char *name;
- /*
- * Not currently supported. Once we can inherit all of struct pid,
- * we can allow this.
- */
- if (current->flags & PF_IO_WORKER)
- return ERR_PTR(-EOPNOTSUPP);
-
if (!pid)
return ERR_PTR(-ENOENT);
name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index c3a345c28a93..9a15334da208 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -1503,11 +1503,8 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
return 0;
out_err:
- if (buf)
- vfree(buf);
-
- if (dump)
- vfree(dump);
+ vfree(buf);
+ vfree(dump);
return ret;
}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 93a217e4f563..14658b009f1b 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -467,7 +467,7 @@ static struct dentry *pstore_mount(struct file_system_type *fs_type,
static void pstore_kill_sb(struct super_block *sb)
{
mutex_lock(&pstore_sb_lock);
- WARN_ON(pstore_sb != sb);
+ WARN_ON(pstore_sb && pstore_sb != sb);
kill_litter_super(sb);
pstore_sb = NULL;
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index aa8e0b65ff1a..fff363bfd484 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -246,7 +246,7 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz,
pr_info("error in header, %d\n", numerr);
prz->corrected_bytes += numerr;
} else if (numerr < 0) {
- pr_info("uncorrectable error in header\n");
+ pr_info_ratelimited("uncorrectable error in header\n");
prz->bad_blocks++;
}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 3c2658c8fde0..9ebd17d7befb 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -151,6 +151,18 @@ static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
return error;
}
+static int ramfs_tmpfile(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+
+ inode = ramfs_get_inode(dir->i_sb, dir, mode, 0);
+ if (!inode)
+ return -ENOSPC;
+ d_tmpfile(dentry, inode);
+ return 0;
+}
+
static const struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
@@ -161,6 +173,7 @@ static const struct inode_operations ramfs_dir_inode_operations = {
.rmdir = simple_rmdir,
.mknod = ramfs_mknod,
.rename = simple_rename,
+ .tmpfile = ramfs_tmpfile,
};
/*
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index b56ff451adce..5b6fcb9b44e2 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2805,7 +2805,7 @@ xfs_btree_split_worker(
struct xfs_btree_split_args *args = container_of(work,
struct xfs_btree_split_args, work);
unsigned long pflags;
- unsigned long new_pflags = PF_MEMALLOC_NOFS;
+ unsigned long new_pflags = 0;
/*
* we are in a transaction context here, but may also be doing work
@@ -2817,12 +2817,20 @@ xfs_btree_split_worker(
new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
current_set_flags_nested(&pflags, new_pflags);
+ xfs_trans_set_context(args->cur->bc_tp);
args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
args->key, args->curp, args->stat);
- complete(args->done);
+ xfs_trans_clear_context(args->cur->bc_tp);
current_restore_flags_nested(&pflags, new_pflags);
+
+ /*
+ * Do not access args after complete() has run here. We don't own args
+ * and the owner may run and free args before we return here.
+ */
+ complete(args->done);
+
}
/*
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4304c6416fbb..b4186d666157 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -62,7 +62,7 @@ xfs_setfilesize_trans_alloc(
* We hand off the transaction to the completion thread now, so
* clear the flag here.
*/
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ xfs_trans_clear_context(tp);
return 0;
}
@@ -125,7 +125,7 @@ xfs_setfilesize_ioend(
* thus we need to mark ourselves as being in a transaction manually.
* Similarly for freeze protection.
*/
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ xfs_trans_set_context(tp);
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
/* we abort the update if there was an IO error */
@@ -568,6 +568,12 @@ xfs_vm_writepage(
{
struct xfs_writepage_ctx wpc = { };
+ if (WARN_ON_ONCE(current->journal_info)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+
return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
}
@@ -578,6 +584,13 @@ xfs_vm_writepages(
{
struct xfs_writepage_ctx wpc = { };
+ /*
+ * Writing back data in a transaction context can result in recursive
+ * transactions. This is bad, so issue a warning and get out of here.
+ */
+ if (WARN_ON_ONCE(current->journal_info))
+ return 0;
+
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index e2148f2d5d6b..17f36db2f792 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -6,7 +6,7 @@
static inline unsigned int bio_max_vecs(unsigned int count)
{
- return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES);
+ return bio_max_segs(howmany(count, PAGE_SIZE));
}
int
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f6e5235df7c9..37a1d12762d8 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1480,7 +1480,7 @@ xfs_buf_ioapply_map(
int op)
{
int page_index;
- int total_nr_pages = bp->b_page_count;
+ unsigned int total_nr_pages = bp->b_page_count;
int nr_pages;
struct bio *bio;
sector_t sector = bp->b_maps[map].bm_bn;
@@ -1505,7 +1505,7 @@ xfs_buf_ioapply_map(
next_chunk:
atomic_inc(&bp->b_io_remaining);
- nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
+ nr_pages = bio_max_segs(total_nr_pages);
bio = bio_alloc(GFP_NOIO, nr_pages);
bio_set_dev(bio, bp->b_target->bt_bdev);
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 3991e59cfd18..ef17c1f6db32 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -344,7 +344,6 @@ xfs_extent_busy_trim(
ASSERT(*len > 0);
spin_lock(&args->pag->pagb_lock);
-restart:
fbno = *bno;
flen = *len;
rbp = args->pag->pagb_tree.rb_node;
@@ -363,19 +362,6 @@ restart:
continue;
}
- /*
- * If this is a metadata allocation, try to reuse the busy
- * extent instead of trimming the allocation.
- */
- if (!(args->datatype & XFS_ALLOC_USERDATA) &&
- !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
- if (!xfs_extent_busy_update_extent(args->mp, args->pag,
- busyp, fbno, flen,
- false))
- goto restart;
- continue;
- }
-
if (bbno <= fbno) {
/* start overlap */
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 145e06c47744..546a6cd96729 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -51,7 +51,7 @@ xfs_panic_mask_proc_handler(
#endif /* CONFIG_PROC_FS */
STATIC int
-xfs_deprecate_irix_sgid_inherit_proc_handler(
+xfs_deprecated_dointvec_minmax(
struct ctl_table *ctl,
int write,
void *buffer,
@@ -59,24 +59,8 @@ xfs_deprecate_irix_sgid_inherit_proc_handler(
loff_t *ppos)
{
if (write) {
- printk_once(KERN_WARNING
- "XFS: " "%s sysctl option is deprecated.\n",
- ctl->procname);
- }
- return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
-}
-
-STATIC int
-xfs_deprecate_irix_symlink_mode_proc_handler(
- struct ctl_table *ctl,
- int write,
- void *buffer,
- size_t *lenp,
- loff_t *ppos)
-{
- if (write) {
- printk_once(KERN_WARNING
- "XFS: " "%s sysctl option is deprecated.\n",
+ printk_ratelimited(KERN_WARNING
+ "XFS: %s sysctl option is deprecated.\n",
ctl->procname);
}
return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
@@ -88,7 +72,7 @@ static struct ctl_table xfs_table[] = {
.data = &xfs_params.sgid_inherit.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = xfs_deprecate_irix_sgid_inherit_proc_handler,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
.extra1 = &xfs_params.sgid_inherit.min,
.extra2 = &xfs_params.sgid_inherit.max
},
@@ -97,7 +81,7 @@ static struct ctl_table xfs_table[] = {
.data = &xfs_params.symlink_mode.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = xfs_deprecate_irix_symlink_mode_proc_handler,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
.extra1 = &xfs_params.symlink_mode.min,
.extra2 = &xfs_params.symlink_mode.max
},
@@ -201,6 +185,15 @@ static struct ctl_table xfs_table[] = {
.extra1 = &xfs_params.blockgc_timer.min,
.extra2 = &xfs_params.blockgc_timer.max,
},
+ {
+ .procname = "speculative_cow_prealloc_lifetime",
+ .data = &xfs_params.blockgc_timer.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = xfs_deprecated_dointvec_minmax,
+ .extra1 = &xfs_params.blockgc_timer.min,
+ .extra2 = &xfs_params.blockgc_timer.max,
+ },
/* please keep this the last entry */
#ifdef CONFIG_PROC_FS
{
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 44f72c09c203..b22a09e9daee 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -72,6 +72,7 @@ xfs_trans_free(
xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
trace_xfs_trans_free(tp, _RET_IP_);
+ xfs_trans_clear_context(tp);
if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
xfs_trans_free_dqinfo(tp);
@@ -123,7 +124,8 @@ xfs_trans_dup(
ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
tp->t_rtx_res = tp->t_rtx_res_used;
- ntp->t_pflags = tp->t_pflags;
+
+ xfs_trans_switch_context(tp, ntp);
/* move deferred ops over to the new tp */
xfs_defer_move(ntp, tp);
@@ -157,9 +159,6 @@ xfs_trans_reserve(
int error = 0;
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
- /* Mark this thread as being in a transaction */
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
/*
* Attempt to reserve the needed disk blocks by decrementing
* the number needed from the number available. This will
@@ -167,10 +166,8 @@ xfs_trans_reserve(
*/
if (blocks > 0) {
error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
- if (error != 0) {
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ if (error != 0)
return -ENOSPC;
- }
tp->t_blk_res += blocks;
}
@@ -244,9 +241,6 @@ undo_blocks:
xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
tp->t_blk_res = 0;
}
-
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
return error;
}
@@ -260,6 +254,7 @@ xfs_trans_alloc(
struct xfs_trans **tpp)
{
struct xfs_trans *tp;
+ bool want_retry = true;
int error;
/*
@@ -267,9 +262,11 @@ xfs_trans_alloc(
* GFP_NOFS allocation context so that we avoid lockdep false positives
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
+retry:
tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
+ xfs_trans_set_context(tp);
/*
* Zero-reservation ("empty") transactions can't modify anything, so
@@ -289,7 +286,9 @@ xfs_trans_alloc(
tp->t_firstblock = NULLFSBLOCK;
error = xfs_trans_reserve(tp, resp, blocks, rtextents);
- if (error == -ENOSPC) {
+ if (error == -ENOSPC && want_retry) {
+ xfs_trans_cancel(tp);
+
/*
* We weren't able to reserve enough space for the transaction.
* Flush the other speculative space allocations to free space.
@@ -297,8 +296,11 @@ xfs_trans_alloc(
* other locks.
*/
error = xfs_blockgc_free_space(mp, NULL);
- if (!error)
- error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+ if (error)
+ return error;
+
+ want_retry = false;
+ goto retry;
}
if (error) {
xfs_trans_cancel(tp);
@@ -893,7 +895,6 @@ __xfs_trans_commit(
xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free(tp);
/*
@@ -925,7 +926,6 @@ out_unreserve:
xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, !!error);
xfs_trans_free(tp);
@@ -985,9 +985,6 @@ xfs_trans_cancel(
tp->t_ticket = NULL;
}
- /* mark this thread as no longer being in a transaction */
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
xfs_trans_free_items(tp, dirty);
xfs_trans_free(tp);
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 8b03fbfe9a1b..9dd745cf77c9 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -281,4 +281,34 @@ int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
struct xfs_trans **tpp);
+static inline void
+xfs_trans_set_context(
+ struct xfs_trans *tp)
+{
+ ASSERT(current->journal_info == NULL);
+ tp->t_pflags = memalloc_nofs_save();
+ current->journal_info = tp;
+}
+
+static inline void
+xfs_trans_clear_context(
+ struct xfs_trans *tp)
+{
+ if (current->journal_info == tp) {
+ memalloc_nofs_restore(tp->t_pflags);
+ current->journal_info = NULL;
+ }
+}
+
+static inline void
+xfs_trans_switch_context(
+ struct xfs_trans *old_tp,
+ struct xfs_trans *new_tp)
+{
+ ASSERT(current->journal_info == old_tp);
+ new_tp->t_pflags = old_tp->t_pflags;
+ old_tp->t_pflags = 0;
+ current->journal_info = new_tp;
+}
+
#endif /* __XFS_TRANS_H__ */