From 775c5033a0d164622d9d10dd0f0a5531639ed3ed Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 4 Mar 2021 11:09:12 +0200 Subject: fuse: fix live lock in fuse_iget() Commit 5d069dbe8aaf ("fuse: fix bad inode") replaced make_bad_inode() in fuse_iget() with a private implementation fuse_make_bad(). The private implementation fails to remove the bad inode from the inode cache, so the retry loop with iget5_locked() finds the same bad inode and marks it bad forever. kmsg snip: [ ] rcu: INFO: rcu_sched self-detected stall on CPU ... [ ] ? bit_wait_io+0x50/0x50 [ ] ? fuse_init_file_inode+0x70/0x70 [ ] ? find_inode.isra.32+0x60/0xb0 [ ] ? fuse_init_file_inode+0x70/0x70 [ ] ilookup5_nowait+0x65/0x90 [ ] ? fuse_init_file_inode+0x70/0x70 [ ] ilookup5.part.36+0x2e/0x80 [ ] ? fuse_init_file_inode+0x70/0x70 [ ] ? fuse_inode_eq+0x20/0x20 [ ] iget5_locked+0x21/0x80 [ ] ? fuse_inode_eq+0x20/0x20 [ ] fuse_iget+0x96/0x1b0 Fixes: 5d069dbe8aaf ("fuse: fix bad inode") Cc: stable@vger.kernel.org # 5.10+ Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 68cca8d4db6e..63d97a15ffde 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -863,6 +863,7 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc) static inline void fuse_make_bad(struct inode *inode) { + remove_inode_hash(inode); set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state); } -- cgit v1.2.3 From 3f9b9efd82a84f27e95d0414f852caf1fa839e83 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 9 Feb 2021 17:47:54 -0500 Subject: virtiofs: Fail dax mount if device does not support it Right now "mount -t virtiofs -o dax myfs /mnt/virtiofs" succeeds even if the filesystem device does not have a cache window and hence DAX can't be supported. This gives users a false sense that they are using DAX with virtiofs, when in fact they are not. Fix this by returning an error if DAX can't be supported and the user has asked for it. Signed-off-by: Vivek Goyal Reviewed-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 8868ac31a3c0..4ee6f734ba83 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1324,8 +1324,15 @@ static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc) /* virtiofs allocates and installs its own fuse devices */ ctx->fudptr = NULL; - if (ctx->dax) + if (ctx->dax) { + if (!fs->dax_dev) { + err = -EINVAL; + pr_err("virtio-fs: dax can't be enabled as filesystem" " device does not support it.\n"); + goto err_free_fuse_devs; + } ctx->dax_dev = fs->dax_dev; + } err = fuse_fill_super_common(sb, ctx); if (err < 0) goto err_free_fuse_devs; -- cgit v1.2.3 From 56887cffe946bb0a90c74429fa94d6110a73119d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2021 10:48:09 +0100 Subject: block: Try to handle busy underlying device on discard Commit 384d87ef2c95 ("block: Do not discard buffers under a mounted filesystem") made paths issuing discard or zeroout requests to the underlying device try to grab the block device in exclusive mode. If that failed, we returned EBUSY to userspace. This however caused unexpected fallout in userspace where e.g. FUSE filesystems issue discard requests from userspace daemons although the device is open exclusively by the kernel.
Also, shrinking a logical volume with LVM issues discard requests to a device which may be claimed exclusively because there's another LV on the same PV. So to avoid these userspace regressions, fall back to invalidate_inode_pages2_range() instead of returning EBUSY to userspace, and return EBUSY only if that call fails as well (meaning that there's indeed someone using the particular device range we are trying to discard). Link: https://bugzilla.kernel.org/show_bug.cgi?id=211167 Fixes: 384d87ef2c95 ("block: Do not discard buffers under a mounted filesystem") CC: stable@vger.kernel.org Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 4aa1f88d5bf8..03166b3dea4d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -118,13 +118,22 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, if (!(mode & FMODE_EXCL)) { int err = bd_prepare_to_claim(bdev, truncate_bdev_range); if (err) - return err; + goto invalidate; } truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); if (!(mode & FMODE_EXCL)) bd_abort_claiming(bdev, truncate_bdev_range); return 0; + +invalidate: + /* + * Someone else has handle exclusively open. Try invalidating instead. + * The 'end' argument is inclusive so the rounding is safe. + */ + return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, + lstart >> PAGE_SHIFT, + lend >> PAGE_SHIFT); } static void set_init_blocksize(struct block_device *bdev) -- cgit v1.2.3 From efc61345274d6c7a46a0570efbc916fcbe3e927b Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 18 Feb 2021 10:11:32 -0500 Subject: ext4: shrink race window in ext4_should_retry_alloc() When generic/371 is run on kvm-xfstests using 5.10 and 5.11 kernels, it fails at significant rates on the two test scenarios that disable delayed allocation (ext3conv and data_journal) and force actual block allocation for the fallocate and pwrite functions in the test. The failure rate on 5.10 for both ext3conv and data_journal on one test system typically runs about 85%. On 5.11, the failure rate on ext3conv sometimes drops to as low as 1% while the rate on data_journal increases to nearly 100%. The observed failures are largely due to ext4_should_retry_alloc() cutting off block allocation retries when s_mb_free_pending (used to indicate that a transaction in progress will free blocks) is 0. However, free space is usually available when this occurs during runs of generic/371. It appears that a thread attempting to allocate blocks is just missing transaction commits in other threads that increase the free cluster count and reset s_mb_free_pending while the allocating thread isn't running. Explicitly testing for free space availability avoids this race. The current code uses a post-increment operator in the conditional expression that determines whether the retry limit has been exceeded. This means that the conditional expression uses the value of the retry counter before it's increased, resulting in an extra retry cycle. The current code actually retries twice before hitting its retry limit rather than once. Increasing the retry limit to 3 from the current actual maximum retry count of 2 in combination with the change described above reduces the observed failure rate to less than 0.1% on both ext3conv and data_journal with what should be limited impact on users sensitive to the overhead caused by retries.
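To make the off-by-one concrete, here is a minimal userspace sketch (illustrative only, not the ext4 code) contrasting the old post-increment test with the pre-increment form used by the fix:

    #include <stdio.h>

    /* Model of the two conditional forms: the post-increment check tests
     * the pre-increment value, so the loop body runs one extra time. */
    static int should_retry_old(int *retries)
    {
        return !((*retries)++ > 1);
    }

    static int should_retry_new(int *retries)
    {
        return !(++(*retries) > 3);
    }

    int main(void)
    {
        int r = 0, n = 0;

        while (should_retry_old(&r))
            n++;
        printf("old check, limit 1: %d retries\n", n); /* prints 2 */

        r = n = 0;
        while (should_retry_new(&r))
            n++;
        printf("new check, limit 3: %d retries\n", n); /* prints 3 */
        return 0;
    }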
A per filesystem percpu counter exported via sysfs is added to allow users or developers to track the number of times the retry limit is exceeded without resorting to debugging methods. This should provide some insight into worst case retry behavior. Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20210218151132.19678-1-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 38 ++++++++++++++++++++++++++------------ fs/ext4/ext4.h | 1 + fs/ext4/super.c | 5 +++++ fs/ext4/sysfs.c | 7 +++++++ 4 files changed, 39 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f45f9feebe59..74a5172c2d83 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -626,27 +626,41 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi, /** * ext4_should_retry_alloc() - check if a block allocation should be retried - * @sb: super block - * @retries: number of attemps has been made + * @sb: superblock + * @retries: number of retry attempts made so far * - * ext4_should_retry_alloc() is called when ENOSPC is returned, and if - * it is profitable to retry the operation, this function will wait - * for the current or committing transaction to complete, and then - * return TRUE. We will only retry once. + * ext4_should_retry_alloc() is called when ENOSPC is returned while + * attempting to allocate blocks. If there's an indication that a pending + * journal transaction might free some space and allow another attempt to + * succeed, this function will wait for the current or committing transaction + * to complete and then return TRUE. */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) || - (*retries)++ > 1 || - !EXT4_SB(sb)->s_journal) + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!sbi->s_journal) return 0; - smp_mb(); - if (EXT4_SB(sb)->s_mb_free_pending == 0) + if (++(*retries) > 3) { + percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit); return 0; + } + /* + * if there's no indication that blocks are about to be freed it's + * possible we just missed a transaction commit that did so + */ + smp_mb(); + if (sbi->s_mb_free_pending == 0) + return ext4_has_free_clusters(sbi, 1, 0); + + /* + * it's possible we've just missed a transaction commit here, + * so ignore the returned status + */ jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); - jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); + (void) jbd2_journal_force_commit_nested(sbi->s_journal); return 1; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 644fd69185d3..ea3b41579f38 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1484,6 +1484,7 @@ struct ext4_sb_info { struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; struct percpu_counter s_dirtyclusters_counter; + struct percpu_counter s_sra_exceeded_retry_limit; struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; struct kobject s_kobj; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ad34a37278cd..a0a256859662 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1210,6 +1210,7 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); percpu_free_rwsem(&sbi->s_writepages_rwsem); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) @@ -5011,6 +5012,9 @@ no_journal: if 
(!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); + if (!err) + err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, + GFP_KERNEL); if (!err) err = percpu_init_rwsem(&sbi->s_writepages_rwsem); @@ -5124,6 +5128,7 @@ failed_mount6: percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); percpu_free_rwsem(&sbi->s_writepages_rwsem); failed_mount5: ext4_ext_release(sb); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 075aa3a19ff5..a3d08276d441 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -24,6 +24,7 @@ typedef enum { attr_session_write_kbytes, attr_lifetime_write_kbytes, attr_reserved_clusters, + attr_sra_exceeded_retry_limit, attr_inode_readahead, attr_trigger_test_error, attr_first_error_time, @@ -202,6 +203,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444); EXT4_ATTR_FUNC(session_write_kbytes, 0444); EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444); EXT4_ATTR_FUNC(reserved_clusters, 0644); +EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, ext4_sb_info, s_inode_readahead_blks); @@ -251,6 +253,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(reserved_clusters), + ATTR_LIST(sra_exceeded_retry_limit), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), @@ -374,6 +377,10 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); + case attr_sra_exceeded_retry_limit: + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long) + percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); case attr_inode_readahead: case attr_pointer_ui: if (!ptr) -- cgit v1.2.3 From 163f0ec1df33cf468509ff38cbcbb5eb0d7fac60 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 22 Feb 2021 18:16:26 +0100 Subject: ext4: add reclaim checks to xattr code Syzbot is reporting that ext4 can enter fs reclaim from kvmalloc() while a transaction is started, like: fs_reclaim_acquire+0x117/0x150 mm/page_alloc.c:4340 might_alloc include/linux/sched/mm.h:193 [inline] slab_pre_alloc_hook mm/slab.h:493 [inline] slab_alloc_node mm/slub.c:2817 [inline] __kmalloc_node+0x5f/0x430 mm/slub.c:4015 kmalloc_node include/linux/slab.h:575 [inline] kvmalloc_node+0x61/0xf0 mm/util.c:587 kvmalloc include/linux/mm.h:781 [inline] ext4_xattr_inode_cache_find fs/ext4/xattr.c:1465 [inline] ext4_xattr_inode_lookup_create fs/ext4/xattr.c:1508 [inline] ext4_xattr_set_entry+0x1ce6/0x3780 fs/ext4/xattr.c:1649 ext4_xattr_ibody_set+0x78/0x2b0 fs/ext4/xattr.c:2224 ext4_xattr_set_handle+0x8f4/0x13e0 fs/ext4/xattr.c:2380 ext4_xattr_set+0x13a/0x340 fs/ext4/xattr.c:2493 This should be impossible since transaction start sets PF_MEMALLOC_NOFS. Add some assertions to the code to catch early if something isn't working as expected.
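For background, the invariant the new assertions check can be modelled in userspace roughly as follows; the flag name is borrowed from the kernel, but everything else is a simplified stand-in, not the jbd2/ext4 code:

    #include <assert.h>
    #include <stdio.h>

    #define PF_MEMALLOC_NOFS 0x1 /* modelled task flag */

    static unsigned int task_flags;

    /* Transaction start is expected to enter a scoped NOFS section, so
     * allocations below it implicitly avoid fs reclaim. */
    static unsigned int nofs_save(void)
    {
        unsigned int old = task_flags;

        task_flags |= PF_MEMALLOC_NOFS;
        return old;
    }

    static void nofs_restore(unsigned int old)
    {
        task_flags = old;
    }

    static void xattr_alloc_path(void)
    {
        /* The added WARN_ON_ONCE calls amount to this check: inside a
         * running transaction the NOFS flag must be set. */
        assert(task_flags & PF_MEMALLOC_NOFS);
    }

    int main(void)
    {
        unsigned int old = nofs_save(); /* transaction start */

        xattr_alloc_path();             /* would warn if the flag were lost */
        nofs_restore(old);              /* transaction stop */
        printf("invariant held\n");
        return 0;
    }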
Link: https://lore.kernel.org/linux-ext4/000000000000563a0205bafb7970@google.com/ Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210222171626.21884-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 372208500f4e..083c95126781 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1462,6 +1462,9 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, if (!ce) return NULL; + WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) && + !(current->flags & PF_MEMALLOC_NOFS)); + ea_data = kvmalloc(value_len, GFP_KERNEL); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); @@ -2327,6 +2330,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, error = -ENOSPC; goto cleanup; } + WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } error = ext4_reserve_inode_write(handle, inode, &is.iloc); -- cgit v1.2.3 From f91436d55a279f045987e8b8c1385585dca54be9 Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Wed, 24 Feb 2021 15:58:00 +0600 Subject: fs/ext4: fix integer overflow in s_log_groups_per_flex syzbot found UBSAN: shift-out-of-bounds in ext4_mb_init [1], when 1 << sbi->s_es->s_log_groups_per_flex is bigger than UINT_MAX, where sbi->s_mb_prefetch is of unsigned integer type. 32 is the maximum allowed power of s_log_groups_per_flex. The following if check would also trigger a UBSAN shift-out-of-bounds: if (1 << sbi->s_es->s_log_groups_per_flex >= UINT_MAX) { So I'm checking it against the raw number; perhaps there is another way to calculate the maximum power below UINT_MAX. Also use min_t to make sure it's an unsigned int type. [1] UBSAN: shift-out-of-bounds in fs/ext4/mballoc.c:2713:24 shift exponent 60 is too large for 32-bit type 'int' Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x137/0x1be lib/dump_stack.c:120 ubsan_epilogue lib/ubsan.c:148 [inline] __ubsan_handle_shift_out_of_bounds+0x432/0x4d0 lib/ubsan.c:395 ext4_mb_init_backend fs/ext4/mballoc.c:2713 [inline] ext4_mb_init+0x19bc/0x19f0 fs/ext4/mballoc.c:2898 ext4_fill_super+0xc2ec/0xfbe0 fs/ext4/super.c:4983 Reported-by: syzbot+a8b4b0c60155e87e9484@syzkaller.appspotmail.com Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210224095800.3350002-1-snovitoll@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 99bf091fee10..a02fadf4fc84 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2709,8 +2709,15 @@ static int ext4_mb_init_backend(struct super_block *sb) } if (ext4_has_feature_flex_bg(sb)) { - /* a single flex group is supposed to be read by a single IO */ - sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex, + /* a single flex group is supposed to be read by a single IO. * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is + * unsigned integer, so the maximum shift is 32.
+ */ + if (sbi->s_es->s_log_groups_per_flex >= 32) { + ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group"); + goto err_freesgi; + } + sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex, BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ } else { -- cgit v1.2.3 From c915fb80eaa6194fa9bd0a4487705cd5b0dda2f1 Mon Sep 17 00:00:00 2001 From: Zhaolong Zhang Date: Tue, 2 Mar 2021 17:42:31 +0800 Subject: ext4: fix bh ref count on error paths __ext4_journalled_writepage() should drop the bhs' ref counts on error paths. Signed-off-by: Zhaolong Zhang Link: https://lore.kernel.org/r/1614678151-70481-1-git-send-email-zhangzl2013@126.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 650c5acd2f2d..a79a9ea58c56 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1938,13 +1938,13 @@ static int __ext4_journalled_writepage(struct page *page, if (!ret) ret = err; - if (!ext4_has_inline_data(inode)) - ext4_walk_page_buffers(NULL, page_bufs, 0, len, - NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: unlock_page(page); out_no_pagelock: + if (!inline_data && page_bufs) + ext4_walk_page_buffers(NULL, page_bufs, 0, len, + NULL, bput_one); brelse(inode_bh); return ret; } -- cgit v1.2.3 From a249cc8bc2e2fed680047d326eb9a50756724198 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 4 Mar 2021 17:42:21 +0000 Subject: cifs: fix credit accounting for extra channel With multichannel, operations like the queries from "ls -lR" can cause all credits to be used and errors to be returned, since max_credits was not being set correctly on the secondary channels. The client was thus incorrectly requesting 0 credits in some cases, which can leave a channel without enough credits to perform any operation.
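The essential change is where the clamp runs, not what it computes: it now happens when the TCP session is created, so secondary channels inherit it. A small sketch of the bounds check (the fallback argument is a placeholder for SMB2_MAX_CREDITS_AVAILABLE; this is not the cifs code itself):

    #include <stdio.h>

    /* Illustrative helper mirroring the patch's bounds. */
    static unsigned int clamp_max_credits(unsigned int requested,
                                          unsigned int fallback)
    {
        if (requested < 20 || requested > 60000)
            return fallback; /* stands in for SMB2_MAX_CREDITS_AVAILABLE */
        return requested;
    }

    int main(void)
    {
        printf("%u %u %u\n",
               clamp_max_credits(0, 32000),      /* unset -> fallback */
               clamp_max_credits(512, 32000),    /* sane -> kept */
               clamp_max_credits(70000, 32000)); /* too big -> fallback */
        return 0;
    }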
Signed-off-by: Aurelien Aptel CC: # v5.8+ Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 10 +++++----- fs/cifs/sess.c | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 112692300fb6..68642e3d4270 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1429,6 +1429,11 @@ smbd_connected: tcp_ses->min_offload = ctx->min_offload; tcp_ses->tcpStatus = CifsNeedNegotiate; + if ((ctx->max_credits < 20) || (ctx->max_credits > 60000)) + tcp_ses->max_credits = SMB2_MAX_CREDITS_AVAILABLE; + else + tcp_ses->max_credits = ctx->max_credits; + tcp_ses->nr_targets = 1; tcp_ses->ignore_signature = ctx->ignore_signature; /* thread spawned, put it on the list */ @@ -2832,11 +2837,6 @@ static int mount_get_conns(struct smb3_fs_context *ctx, struct cifs_sb_info *cif *nserver = server; - if ((ctx->max_credits < 20) || (ctx->max_credits > 60000)) - server->max_credits = SMB2_MAX_CREDITS_AVAILABLE; - else - server->max_credits = ctx->max_credits; - /* get a reference to a SMB session */ ses = cifs_get_smb_ses(server, ctx); if (IS_ERR(ses)) { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 183a3a868d7b..63d517b9f2ff 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -230,6 +230,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, ctx.noautotune = ses->server->noautotune; ctx.sockopt_tcp_nodelay = ses->server->tcp_nodelay; ctx.echo_interval = ses->server->echo_interval / HZ; + ctx.max_credits = ses->server->max_credits; /* * This will be used for encoding/decoding user/domain/pw -- cgit v1.2.3 From 88fd98a2306755b965e4f4567f84e73db3b6738c Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 4 Mar 2021 17:51:48 +0000 Subject: cifs: ask for more credit on async read/write code paths When doing a large read or write workload we only very gradually increase the number of credits, which can cause problems with parallelizing large I/O (I/O ramps up more slowly than it should for large read/write workloads), especially with multichannel when the number of credits on the secondary channels starts out low (e.g. less than about 130) or when recovering after the server has throttled back the number of credits.
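To see why this speeds up the ramp, here is a sketch of the request computation (the 64KiB buffer size is an assumption for illustration; names are not the cifs code):

    #include <stdio.h>

    #define ASSUMED_MAX_BUFFER_SIZE 65536 /* assumption for illustration */
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* The charge pays for the payload; the extra credits requested on
     * top are what grow the window: +1 before the patch, +8 after. */
    static unsigned int credit_request(unsigned int bytes, unsigned int extra)
    {
        return DIV_ROUND_UP(bytes, ASSUMED_MAX_BUFFER_SIZE) + extra;
    }

    int main(void)
    {
        unsigned int bytes = 1u << 20; /* one 1 MiB async read */

        printf("charge+1 = %u, charge+8 = %u\n",
               credit_request(bytes, 1), credit_request(bytes, 8));
        return 0;
    }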
Signed-off-by: Aurelien Aptel Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 4bbb6126b14d..2199a9bfae8f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -4041,8 +4041,7 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); - shdr->CreditRequest = - cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1); + shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8); rc = adjust_credits(server, &rdata->credits, rdata->bytes); if (rc) @@ -4348,8 +4347,7 @@ smb2_async_writev(struct cifs_writedata *wdata, if (wdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); - shdr->CreditRequest = - cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1); + shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8); rc = adjust_credits(server, &wdata->credits, wdata->bytes); if (rc) -- cgit v1.2.3 From 886d0137f104a440d9dfa1d16efc1db06c9a2c02 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Mar 2021 12:59:30 -0700 Subject: io-wq: fix race in freeing 'wq' and worker access Ran into a use-after-free on the main io-wq struct, wq. It has a worker ref and completion event, but the manager itself isn't holding a reference. This can lead to a race where the manager thinks there are no workers and exits, but a worker is being added. That leads to the following trace: BUG: KASAN: use-after-free in io_wqe_worker+0x4c0/0x5e0 Read of size 8 at addr ffff888108baa8a0 by task iou-wrk-3080422/3080425 CPU: 5 PID: 3080425 Comm: iou-wrk-3080422 Not tainted 5.12.0-rc1+ #110 Hardware name: Micro-Star International Co., Ltd. MS-7C60/TRX40 PRO 10G (MS-7C60), BIOS 1.60 05/13/2020 Call Trace: dump_stack+0x90/0xbe print_address_description.constprop.0+0x67/0x28d ? io_wqe_worker+0x4c0/0x5e0 kasan_report.cold+0x7b/0xd4 ? io_wqe_worker+0x4c0/0x5e0 __asan_load8+0x6d/0xa0 io_wqe_worker+0x4c0/0x5e0 ? io_worker_handle_work+0xc00/0xc00 ? recalc_sigpending+0xe5/0x120 ? io_worker_handle_work+0xc00/0xc00 ? io_worker_handle_work+0xc00/0xc00 ret_from_fork+0x1f/0x30 Allocated by task 3080422: kasan_save_stack+0x23/0x60 __kasan_kmalloc+0x80/0xa0 kmem_cache_alloc_node_trace+0xa0/0x480 io_wq_create+0x3b5/0x600 io_uring_alloc_task_context+0x13c/0x380 io_uring_add_task_file+0x109/0x140 __x64_sys_io_uring_enter+0x45f/0x660 do_syscall_64+0x32/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 3080422: kasan_save_stack+0x23/0x60 kasan_set_track+0x20/0x40 kasan_set_free_info+0x24/0x40 __kasan_slab_free+0xe8/0x120 kfree+0xa8/0x400 io_wq_put+0x14a/0x220 io_wq_put_and_exit+0x9a/0xc0 io_uring_clean_tctx+0x101/0x140 __io_uring_files_cancel+0x36e/0x3c0 do_exit+0x169/0x1340 __x64_sys_exit+0x34/0x40 do_syscall_64+0x32/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Have the manager itself hold a reference; now both drop points drop the reference and complete if we hit zero, and the manager can unconditionally do a wait_for_completion() instead of racing between reading the ref count and waiting if it was non-zero.
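The fixed pattern is general enough to show in isolation: every participant (including the manager) holds a reference, every drop point completes on reaching zero, and the waiter waits unconditionally. A userspace model with pthreads, names illustrative:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int worker_refs;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int completed;

    static void complete_worker_done(void)
    {
        pthread_mutex_lock(&lock);
        completed = 1;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
    }

    static void put_worker_ref(void)
    {
        if (atomic_fetch_sub(&worker_refs, 1) == 1)
            complete_worker_done();
    }

    static void *worker(void *arg)
    {
        (void)arg;
        put_worker_ref(); /* worker exit drops its reference */
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        atomic_store(&worker_refs, 1);     /* the manager's own reference */
        atomic_fetch_add(&worker_refs, 1); /* one reference per worker */
        pthread_create(&t, NULL, worker, NULL);

        put_worker_ref();          /* manager drops its reference... */
        pthread_mutex_lock(&lock); /* ...and can wait unconditionally */
        while (!completed)
            pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        printf("no ref-read/wait race\n");
        return 0;
    }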
Fixes: fb3a1f6c745c ("io-wq: have manager wait for all workers to exit") Signed-off-by: Jens Axboe --- fs/io-wq.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 28868eb4cd09..1bfdb86336e4 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -722,9 +722,9 @@ static int io_wq_manager(void *data) io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); rcu_read_unlock(); - /* we might not ever have created any workers */ - if (atomic_read(&wq->worker_refs)) - wait_for_completion(&wq->worker_done); + if (atomic_dec_and_test(&wq->worker_refs)) + complete(&wq->worker_done); + wait_for_completion(&wq->worker_done); spin_lock_irq(&wq->hash->wait.lock); for_each_node(node) @@ -774,7 +774,8 @@ static int io_wq_fork_manager(struct io_wq *wq) if (wq->manager) return 0; - reinit_completion(&wq->worker_done); + init_completion(&wq->worker_done); + atomic_set(&wq->worker_refs, 1); tsk = create_io_thread(io_wq_manager, wq, NUMA_NO_NODE); if (!IS_ERR(tsk)) { wq->manager = get_task_struct(tsk); @@ -782,6 +783,9 @@ static int io_wq_fork_manager(struct io_wq *wq) return 0; } + if (atomic_dec_and_test(&wq->worker_refs)) + complete(&wq->worker_done); + return PTR_ERR(tsk); } @@ -1018,13 +1022,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) init_completion(&wq->exited); refcount_set(&wq->refs, 1); - init_completion(&wq->worker_done); - atomic_set(&wq->worker_refs, 0); - ret = io_wq_fork_manager(wq); if (!ret) return wq; - err: io_wq_put_hash(data->hash); cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); -- cgit v1.2.3 From 003e8dccdb22712dae388e682182d5f08b32386f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 6 Mar 2021 09:22:27 -0700 Subject: io-wq: always track creds for async issue If we go async with a request, grab the creds that the task currently has assigned and make sure that the async side switches to them. This is handled in the same way that we do for registered personalities. 
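The capture-at-submit / assume-at-execute pattern looks roughly like this userspace sketch (all names illustrative; the kernel side uses get_current_cred(), override_creds() and revert_creds()):

    #include <stdio.h>

    struct cred { const char *who; };

    static const struct cred user_cred = { "user" };
    static const struct cred worker_cred = { "worker" };
    static const struct cred *current_task_cred = &user_cred;

    struct work { const struct cred *creds; };

    static void prep_async_work(struct work *w)
    {
        w->creds = current_task_cred; /* grab at submission time */
    }

    static void run_work(const struct work *w)
    {
        const struct cred *saved = current_task_cred;

        current_task_cred = w->creds; /* override for the async side */
        printf("issued as \"%s\"\n", current_task_cred->who);
        current_task_cred = saved;    /* revert afterwards */
    }

    int main(void)
    {
        struct work w;

        prep_async_work(&w);
        current_task_cred = &worker_cred; /* worker runs under other creds */
        run_work(&w);                     /* still issued as "user" */
        return 0;
    }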
Signed-off-by: Jens Axboe --- fs/io-wq.h | 2 +- fs/io_uring.c | 33 +++++++++++++++++++-------------- 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.h b/fs/io-wq.h index 5fbf7997149e..1ac2f3248088 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -79,8 +79,8 @@ static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; + const struct cred *creds; unsigned flags; - unsigned short personality; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) diff --git a/fs/io_uring.c b/fs/io_uring.c index 92c25b5f1349..d51c6ba9268b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1183,6 +1183,9 @@ static void io_prep_async_work(struct io_kiocb *req) const struct io_op_def *def = &io_op_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; + if (!req->work.creds) + req->work.creds = get_current_cred(); + if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; @@ -1648,6 +1651,10 @@ static void io_dismantle_req(struct io_kiocb *req) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); if (req->fixed_rsrc_refs) percpu_ref_put(req->fixed_rsrc_refs); + if (req->work.creds) { + put_cred(req->work.creds); + req->work.creds = NULL; + } if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -5916,18 +5923,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) const struct cred *creds = NULL; int ret; - if (req->work.personality) { - const struct cred *new_creds; - - if (!(issue_flags & IO_URING_F_NONBLOCK)) - mutex_lock(&ctx->uring_lock); - new_creds = idr_find(&ctx->personality_idr, req->work.personality); - if (!(issue_flags & IO_URING_F_NONBLOCK)) - mutex_unlock(&ctx->uring_lock); - if (!new_creds) - return -EINVAL; - creds = override_creds(new_creds); - } + if (req->work.creds && req->work.creds != current_cred()) + creds = override_creds(req->work.creds); switch (req->opcode) { case IORING_OP_NOP: @@ -6291,7 +6288,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, { struct io_submit_state *state; unsigned int sqe_flags; - int ret = 0; + int personality, ret = 0; req->opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ @@ -6324,8 +6321,16 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, return -EOPNOTSUPP; req->work.list.next = NULL; + personality = READ_ONCE(sqe->personality); + if (personality) { + req->work.creds = idr_find(&ctx->personality_idr, personality); + if (!req->work.creds) + return -EINVAL; + get_cred(req->work.creds); + } else { + req->work.creds = NULL; + } req->work.flags = 0; - req->work.personality = READ_ONCE(sqe->personality); state = &ctx->submit_state; /* -- cgit v1.2.3 From d30881f573e565ebb5dbb50b31ed6106b5c81328 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 18 Feb 2021 21:02:07 -0500 Subject: nfsd: Don't keep looking up unhashed files in the nfsd file cache If a file is unhashed, then we're going to reject it anyway and retry, so make sure we skip it when we're doing the RCU lockless lookup. 
This avoids a number of unnecessary nfserr_jukebox returns from nfsd_file_acquire(). Fixes: 65294c1f2c5e ("nfsd: add a new struct file caching facility to nfsd") Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 53fcbf79bdca..7629248fdd53 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -898,6 +898,8 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, continue; if (!nfsd_match_cred(nf->nf_cred, current_cred())) continue; + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) + continue; if (nfsd_file_get(nf) != NULL) return nf; } -- cgit v1.2.3 From 7005227369079963d25fb2d5d736d0feb2c44cf6 Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Fri, 19 Feb 2021 16:56:10 -0500 Subject: fs: nfsd: fix kconfig dependency warning for NFSD_V4 When NFSD_V4 is enabled and CRYPTO is disabled, Kbuild gives the following warning: WARNING: unmet direct dependencies detected for CRYPTO_SHA256 Depends on [n]: CRYPTO [=n] Selected by [y]: - NFSD_V4 [=y] && NETWORK_FILESYSTEMS [=y] && NFSD [=y] && PROC_FS [=y] WARNING: unmet direct dependencies detected for CRYPTO_MD5 Depends on [n]: CRYPTO [=n] Selected by [y]: - NFSD_V4 [=y] && NETWORK_FILESYSTEMS [=y] && NFSD [=y] && PROC_FS [=y] This is because NFSD_V4 selects CRYPTO_MD5 and CRYPTO_SHA256, without depending on or selecting CRYPTO, despite those config options being subordinate to CRYPTO. Signed-off-by: Julian Braha Signed-off-by: Chuck Lever --- fs/nfsd/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 821e5913faee..d6cff5fbe705 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -73,6 +73,7 @@ config NFSD_V4 select NFSD_V3 select FS_POSIX_ACL select SUNRPC_GSS + select CRYPTO select CRYPTO_MD5 select CRYPTO_SHA256 select GRACE_PERIOD -- cgit v1.2.3 From bfdd89f232aa2de5a4b3fc985cba894148b830a8 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 24 Feb 2021 13:39:50 -0500 Subject: nfsd: don't abort copies early The typical result of the backwards comparison here is that the source server in a server-to-server copy will return BAD_STATEID within a few seconds of the copy starting, instead of giving the copy a full lease period, so the copy_file_range() call will end up unnecessarily returning a short read. Fixes: 624322f1adc5 ("NFSD add COPY_NOTIFY operation") Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 423fd6683f3a..61552e89bd89 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5389,7 +5389,7 @@ nfs4_laundromat(struct nfsd_net *nn) idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) { cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid); if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID && - cps->cpntf_time > cutoff) + cps->cpntf_time < cutoff) _free_cpntf_state_locked(nn, cps); } spin_unlock(&nn->s2s_cp_lock); -- cgit v1.2.3 From eb602521f43876b3f76c4686de596c9804977228 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 4 Mar 2021 09:28:57 -0500 Subject: gfs2: change function gfs2_make_fs_ro() to void type It fixes the following warning detected by coccinelle: ./fs/gfs2/super.c:592:5-10: Unneeded variable: "error".
Return "0" on line 628 Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/ops_fstype.c | 4 +--- fs/gfs2/super.c | 10 ++-------- fs/gfs2/super.h | 2 +- fs/gfs2/util.c | 2 +- 4 files changed, 5 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 74c7d01723b9..aa4136055a83 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1539,9 +1539,7 @@ static int gfs2_reconfigure(struct fs_context *fc) return -EINVAL; if (fc->sb_flags & SB_RDONLY) { - error = gfs2_make_fs_ro(sdp); - if (error) - errorfc(fc, "unable to remount read-only"); + gfs2_make_fs_ro(sdp); } else { error = gfs2_make_fs_rw(sdp); if (error) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 861ed5fe02a5..97076d3f562f 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -587,9 +587,8 @@ out: * Returns: errno */ -int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +void gfs2_make_fs_ro(struct gfs2_sbd *sdp) { - int error = 0; int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); gfs2_flush_delete_work(sdp); @@ -624,8 +623,6 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) if (!log_write_allowed) sdp->sd_vfs->s_flags |= SB_RDONLY; - - return error; } /** @@ -637,7 +634,6 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) static void gfs2_put_super(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; - int error; struct gfs2_jdesc *jd; /* No more recovery requests */ @@ -658,9 +654,7 @@ restart: spin_unlock(&sdp->sd_jindex_spin); if (!sb_rdonly(sb)) { - error = gfs2_make_fs_ro(sdp); - if (error) - gfs2_io_error(sdp); + gfs2_make_fs_ro(sdp); } WARN_ON(gfs2_withdrawing(sdp)); diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 08e502dec7ec..ec4affb33ed5 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -34,7 +34,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, struct gfs2_inode **ipp); extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); -extern int gfs2_make_fs_ro(struct gfs2_sbd *sdp); +extern void gfs2_make_fs_ro(struct gfs2_sbd *sdp); extern void gfs2_online_uevent(struct gfs2_sbd *sdp); extern int gfs2_statfs_init(struct gfs2_sbd *sdp); extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 8d3c670c990f..58743315cda9 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -156,7 +156,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) ret = 0; } if (!ret) - ret = gfs2_make_fs_ro(sdp); + gfs2_make_fs_ro(sdp); gfs2_freeze_unlock(&freeze_gh); } -- cgit v1.2.3 From 1a5a2cfd34c17db73c53ef127272c8c1ae220485 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 25 Feb 2021 11:11:09 -0500 Subject: gfs2: fix use-after-free in trans_drain This patch adds code to function trans_drain to remove drained bd elements from the ail lists, if queued, before freeing the bd. If we don't remove the bd from the ail, function ail_drain will try to reference the bd after it has been freed by trans_drain. Thanks to Andy Price for his analysis of the problem. 
Reported-by: Andy Price Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/log.c | 4 ++++ fs/gfs2/trans.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 16937ebb2a3e..760af666576c 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -998,12 +998,16 @@ static void trans_drain(struct gfs2_trans *tr) while (!list_empty(head)) { bd = list_first_entry(head, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); + if (!list_empty(&bd->bd_ail_st_list)) + gfs2_remove_from_ail(bd); kmem_cache_free(gfs2_bufdata_cachep, bd); } head = &tr->tr_databuf; while (!list_empty(head)) { bd = list_first_entry(head, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); + if (!list_empty(&bd->bd_ail_st_list)) + gfs2_remove_from_ail(bd); kmem_cache_free(gfs2_bufdata_cachep, bd); } } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index ab96cf0bf26b..63fec11ef2ce 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -169,6 +169,8 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, bd->bd_bh = bh; bd->bd_gl = gl; INIT_LIST_HEAD(&bd->bd_list); + INIT_LIST_HEAD(&bd->bd_ail_st_list); + INIT_LIST_HEAD(&bd->bd_ail_gl_list); bh->b_private = bd; return bd; } -- cgit v1.2.3 From 2941267bd3dad018de1d51fe2cd996b7bc1e5a5d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:11 +0000 Subject: io_uring: make del_task_file more forgiving Rework io_uring_del_task_file(), so it accepts an index to delete, and it doesn't necessarily have to be in the ->xa. Infer the file from xa_erase() to maintain a single source of truth. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index d51c6ba9268b..00a736867b76 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8785,15 +8785,18 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) /* * Remove this io_uring_file -> task mapping. */ -static void io_uring_del_task_file(struct file *file) +static void io_uring_del_task_file(unsigned long index) { struct io_uring_task *tctx = current->io_uring; + struct file *file; + + file = xa_erase(&tctx->xa, index); + if (!file) + return; if (tctx->last == file) tctx->last = NULL; - file = xa_erase(&tctx->xa, (unsigned long)file); - if (file) - fput(file); + fput(file); } static void io_uring_clean_tctx(struct io_uring_task *tctx) @@ -8802,7 +8805,7 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) unsigned long index; xa_for_each(&tctx->xa, index, file) - io_uring_del_task_file(file); + io_uring_del_task_file(index); if (tctx->io_wq) { io_wq_put_and_exit(tctx->io_wq); tctx->io_wq = NULL; -- cgit v1.2.3 From 13bf43f5f4739739751c0049a1582610c283bdde Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:12 +0000 Subject: io_uring: introduce ctx to tctx back map For each tctx-ctx pair, create an object and chain it into the ctx, so we have a way to traverse all tctx that are using the current ctx. Preparation patch; it will be used later.
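A minimal userspace sketch of the resulting two-way bookkeeping (names illustrative, not the io_uring structures): each ring chains the per-task nodes, so the ring side can reach every task that has used it:

    #include <stdio.h>

    struct ring;

    struct node {
        struct node *next;      /* chained into the ring's tctx list */
        const char *task_name;  /* stands in for node->task */
        struct ring *ring;      /* stands in for node->ctx */
    };

    struct ring {
        struct node *tctx_list; /* head of the back map */
    };

    static void link_node(struct ring *r, struct node *n)
    {
        n->ring = r;
        n->next = r->tctx_list;
        r->tctx_list = n;
    }

    int main(void)
    {
        struct ring r = { 0 };
        struct node a = { 0, "task-a", 0 };
        struct node b = { 0, "task-b", 0 };

        link_node(&r, &a);
        link_node(&r, &b);

        for (struct node *n = r.tctx_list; n; n = n->next)
            printf("%s uses this ring\n", n->task_name);
        return 0;
    }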
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 58 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 00a736867b76..9a2cff0662e0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -454,6 +454,7 @@ struct io_ring_ctx { /* Keep this last, we don't need it for the fast path */ struct work_struct exit_work; + struct list_head tctx_list; }; /* @@ -805,6 +806,13 @@ struct io_kiocb { struct io_wq_work work; }; +struct io_tctx_node { + struct list_head ctx_node; + struct task_struct *task; + struct file *file; + struct io_ring_ctx *ctx; +}; + struct io_defer_entry { struct list_head list; struct io_kiocb *req; @@ -1144,6 +1152,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_llist_head(&ctx->rsrc_put_llist); + INIT_LIST_HEAD(&ctx->tctx_list); INIT_LIST_HEAD(&ctx->submit_state.comp.free_list); INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list); return ctx; @@ -8748,6 +8757,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) { struct io_uring_task *tctx = current->io_uring; + struct io_tctx_node *node; int ret; if (unlikely(!tctx)) { @@ -8760,13 +8770,25 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) void *old = xa_load(&tctx->xa, (unsigned long)file); if (!old) { + node = kmalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + node->ctx = ctx; + node->file = file; + node->task = current; + get_file(file); ret = xa_err(xa_store(&tctx->xa, (unsigned long)file, - file, GFP_KERNEL)); + node, GFP_KERNEL)); if (ret) { fput(file); + kfree(node); return ret; } + + mutex_lock(&ctx->uring_lock); + list_add(&node->ctx_node, &ctx->tctx_list); + mutex_unlock(&ctx->uring_lock); } tctx->last = file; } @@ -8788,23 +8810,31 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) static void io_uring_del_task_file(unsigned long index) { struct io_uring_task *tctx = current->io_uring; - struct file *file; + struct io_tctx_node *node; - file = xa_erase(&tctx->xa, index); - if (!file) + node = xa_erase(&tctx->xa, index); + if (!node) return; - if (tctx->last == file) + WARN_ON_ONCE(current != node->task); + WARN_ON_ONCE(list_empty(&node->ctx_node)); + + mutex_lock(&node->ctx->uring_lock); + list_del(&node->ctx_node); + mutex_unlock(&node->ctx->uring_lock); + + if (tctx->last == node->file) tctx->last = NULL; - fput(file); + fput(node->file); + kfree(node); } static void io_uring_clean_tctx(struct io_uring_task *tctx) { - struct file *file; + struct io_tctx_node *node; unsigned long index; - xa_for_each(&tctx->xa, index, file) + xa_for_each(&tctx->xa, index, node) io_uring_del_task_file(index); if (tctx->io_wq) { io_wq_put_and_exit(tctx->io_wq); @@ -8815,13 +8845,13 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) void __io_uring_files_cancel(struct files_struct *files) { struct io_uring_task *tctx = current->io_uring; - struct file *file; + struct io_tctx_node *node; unsigned long index; /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); - xa_for_each(&tctx->xa, index, file) - io_uring_cancel_task_requests(file->private_data, files); + xa_for_each(&tctx->xa, index, node) + io_uring_cancel_task_requests(node->ctx, files); 
atomic_dec(&tctx->in_idle); if (files) @@ -8884,11 +8914,11 @@ void __io_uring_task_cancel(void) atomic_inc(&tctx->in_idle); if (tctx->sqpoll) { - struct file *file; + struct io_tctx_node *node; unsigned long index; - xa_for_each(&tctx->xa, index, file) - io_uring_cancel_sqpoll(file->private_data); + xa_for_each(&tctx->xa, index, node) + io_uring_cancel_sqpoll(node->ctx); } do { -- cgit v1.2.3 From d56d938b4bef3e1421a42023cdcd6e13c1f50831 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:13 +0000 Subject: io_uring: do ctx initiated file note removal Another preparation patch. When full quiesce is done on ctx exit, use the task_work infrastructure to remove the io_uring->xa entries corresponding to the ctx. For that we use the back tctx map. Also use ->in_idle to prevent removing an entry while we're traversing ->xa on cancellation; in that case, just ignore it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9a2cff0662e0..8a4ab86ae64f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -987,6 +987,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_UNLINKAT] = {}, }; +static void io_uring_del_task_file(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files); @@ -8536,10 +8537,33 @@ static bool io_run_ctx_fallback(struct io_ring_ctx *ctx) return executed; } +struct io_tctx_exit { + struct callback_head task_work; + struct completion completion; + unsigned long index; +}; + +static void io_tctx_exit_cb(struct callback_head *cb) +{ + struct io_uring_task *tctx = current->io_uring; + struct io_tctx_exit *work; + + work = container_of(cb, struct io_tctx_exit, task_work); + /* + * When @in_idle, we're in cancellation and it's racy to remove the + * node. It'll be removed by the end of cancellation, just ignore it.
+ */ + if (!atomic_read(&tctx->in_idle)) + io_uring_del_task_file(work->index); + complete(&work->completion); +} + static void io_ring_exit_work(struct work_struct *work) { - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, - exit_work); + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); + struct io_tctx_exit exit; + struct io_tctx_node *node; + int ret; /* * If we're doing polled IO and end up having requests being @@ -8550,6 +8574,26 @@ static void io_ring_exit_work(struct work_struct *work) do { io_uring_try_cancel_requests(ctx, NULL, NULL); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); + + mutex_lock(&ctx->uring_lock); + while (!list_empty(&ctx->tctx_list)) { + node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, + ctx_node); + exit.index = (unsigned long)node->file; + init_completion(&exit.completion); + init_task_work(&exit.task_work, io_tctx_exit_cb); + ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); + if (WARN_ON_ONCE(ret)) + continue; + wake_up_process(node->task); + + mutex_unlock(&ctx->uring_lock); + wait_for_completion(&exit.completion); + cond_resched(); + mutex_lock(&ctx->uring_lock); + } + mutex_unlock(&ctx->uring_lock); + io_ring_ctx_free(ctx); } -- cgit v1.2.3 From eebd2e37e662617a6b8041db75205f0a262ce870 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:14 +0000 Subject: io_uring: don't take task ring-file notes With ->flush() gone we're now leaving all uring file notes until the task dies/execs, so the ctx will not be freed until all tasks that have ever submitted a request die. It was nicer with ->flush(), but not by much; we could have locked the ctx as described in many cases. Now we guarantee that ctx outlives all tctx, in the sense that io_ring_exit_work() waits for all tctxs to drop their corresponding entries in ->xa, and ctx won't go away until then. Hence, additional io_uring file references (a.k.a. task file notes) are not needed anymore. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a4ab86ae64f..f448213267c8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8821,11 +8821,9 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) node->file = file; node->task = current; - get_file(file); ret = xa_err(xa_store(&tctx->xa, (unsigned long)file, node, GFP_KERNEL)); if (ret) { - fput(file); kfree(node); return ret; } @@ -8856,6 +8854,8 @@ static void io_uring_del_task_file(unsigned long index) struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; + if (!tctx) + return; node = xa_erase(&tctx->xa, index); if (!node) return; @@ -8869,7 +8869,6 @@ static void io_uring_del_task_file(unsigned long index) if (tctx->last == node->file) tctx->last = NULL; - fput(node->file); kfree(node); } -- cgit v1.2.3 From baf186c4d345f5a105e63df01100936ad622f369 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:15 +0000 Subject: io_uring: index io_uring->xa by ctx not file We don't use task file notes anymore, and there's no need left to index task->io_uring->xa by file, so replace it with ctx. It's better design-wise, especially since we kept a dangling file and so had to keep an eye on not dereferencing it.
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 +++++++++++------------- include/linux/io_uring.h | 2 +- 2 files changed, 12 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index f448213267c8..01a7fa4a4889 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -809,7 +809,6 @@ struct io_kiocb { struct io_tctx_node { struct list_head ctx_node; struct task_struct *task; - struct file *file; struct io_ring_ctx *ctx; }; @@ -8540,7 +8539,7 @@ static bool io_run_ctx_fallback(struct io_ring_ctx *ctx) struct io_tctx_exit { struct callback_head task_work; struct completion completion; - unsigned long index; + struct io_ring_ctx *ctx; }; static void io_tctx_exit_cb(struct callback_head *cb) @@ -8554,7 +8553,7 @@ static void io_tctx_exit_cb(struct callback_head *cb) * node. It'll be removed by the end of cancellation, just ignore it. */ if (!atomic_read(&tctx->in_idle)) - io_uring_del_task_file(work->index); + io_uring_del_task_file((unsigned long)work->ctx); complete(&work->completion); } @@ -8579,7 +8578,7 @@ static void io_ring_exit_work(struct work_struct *work) while (!list_empty(&ctx->tctx_list)) { node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, ctx_node); - exit.index = (unsigned long)node->file; + exit.ctx = ctx; init_completion(&exit.completion); init_task_work(&exit.task_work, io_tctx_exit_cb); ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); @@ -8798,7 +8797,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, /* * Note that this task has used io_uring. We use it for cancelation purposes. */ -static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) +static int io_uring_add_task_file(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; @@ -8810,18 +8809,17 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) return ret; tctx = current->io_uring; } - if (tctx->last != file) { - void *old = xa_load(&tctx->xa, (unsigned long)file); + if (tctx->last != ctx) { + void *old = xa_load(&tctx->xa, (unsigned long)ctx); if (!old) { node = kmalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; node->ctx = ctx; - node->file = file; node->task = current; - ret = xa_err(xa_store(&tctx->xa, (unsigned long)file, + ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, node, GFP_KERNEL)); if (ret) { kfree(node); @@ -8832,7 +8830,7 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) list_add(&node->ctx_node, &ctx->tctx_list); mutex_unlock(&ctx->uring_lock); } - tctx->last = file; + tctx->last = ctx; } /* @@ -8867,7 +8865,7 @@ static void io_uring_del_task_file(unsigned long index) list_del(&node->ctx_node); mutex_unlock(&node->ctx->uring_lock); - if (tctx->last == node->file) + if (tctx->last == node->ctx) tctx->last = NULL; kfree(node); } @@ -9166,7 +9164,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } submitted = to_submit; } else if (to_submit) { - ret = io_uring_add_task_file(ctx, f.file); + ret = io_uring_add_task_file(ctx); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); @@ -9375,7 +9373,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) if (fd < 0) return fd; - ret = io_uring_add_task_file(ctx, file); + ret = io_uring_add_task_file(ctx); if (ret) { put_unused_fd(fd); return ret; diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 7cb7bd0e334c..9761a0ec9f95 100644 --- 
a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -18,7 +18,7 @@ struct io_uring_task { /* submission side */ struct xarray xa; struct wait_queue_head wait; - struct file *last; + void *last; void *io_wq; struct percpu_counter inflight; atomic_t in_idle; -- cgit v1.2.3 From b5bb3a24f69da92e0ec2a301452364333e45be03 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:16 +0000 Subject: io_uring: warn when ring exit takes too long We use system_unbound_wq to run io_ring_exit_work(), so it's hard to monitor whether removal hangs or not. Add WARN_ONCE to catch hangs. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 01a7fa4a4889..945e54690b81 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8560,6 +8560,7 @@ static void io_tctx_exit_cb(struct callback_head *cb) static void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); + unsigned long timeout = jiffies + HZ * 60 * 5; struct io_tctx_exit exit; struct io_tctx_node *node; int ret; @@ -8572,10 +8573,14 @@ static void io_ring_exit_work(struct work_struct *work) */ do { io_uring_try_cancel_requests(ctx, NULL, NULL); + + WARN_ON_ONCE(time_after(jiffies, timeout)); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); mutex_lock(&ctx->uring_lock); while (!list_empty(&ctx->tctx_list)) { + WARN_ON_ONCE(time_after(jiffies, timeout)); + node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, ctx_node); exit.ctx = ctx; -- cgit v1.2.3 From 1b00764f09b6912d25e188d972a7764a457926ba Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:17 +0000 Subject: io_uring: cancel reqs of all iowq's on ring exit io_ring_exit_work() has to cancel all requests, including those staying in io-wq; however, it tries cancellation only for the current tctx, which is NULL. If we've got task==NULL, use the ctx-to-tctx map to go over all tctx/io-wq and try cancellations on them. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 945e54690b81..8c74c7799960 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8688,19 +8688,55 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, } } +static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + return req->ctx == data; +} + +static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) +{ + struct io_tctx_node *node; + enum io_wq_cancel cret; + bool ret = false; + + mutex_lock(&ctx->uring_lock); + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + /* + * io_wq will stay alive while we hold uring_lock, because it's + * killed after ctx nodes, which requires to take the lock.
+ */ + if (!tctx || !tctx->io_wq) + continue; + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } + mutex_unlock(&ctx->uring_lock); + + return ret; +} + static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files) { struct io_task_cancel cancel = { .task = task, .files = files, }; - struct task_struct *tctx_task = task ?: current; - struct io_uring_task *tctx = tctx_task->io_uring; + struct io_uring_task *tctx = task ? task->io_uring : NULL; while (1) { enum io_wq_cancel cret; bool ret = false; - if (tctx && tctx->io_wq) { + if (!task) { + ret |= io_uring_try_cancel_iowq(ctx); + } else if (tctx && tctx->io_wq) { + /* + * Cancels requests of all rings, not only @ctx, but + * it's fine as the task is in exit/exec. + */ cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, &cancel, true); ret |= (cret != IO_WQ_CANCEL_NOTFOUND); -- cgit v1.2.3 From 678eeba481d8c161203382832a4379d507050aed Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 6 Mar 2021 11:02:18 +0000 Subject: io-wq: warn on creating manager while exiting Add a simple warning making sure that nobody tries to create a new manager while we're under IO_WQ_BIT_EXIT. That can potentially happen due to racy work submission after final put. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 1bfdb86336e4..1ab9324e602f 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -774,6 +774,8 @@ static int io_wq_fork_manager(struct io_wq *wq) if (wq->manager) return 0; + WARN_ON_ONCE(test_bit(IO_WQ_BIT_EXIT, &wq->state)); + init_completion(&wq->worker_done); atomic_set(&wq->worker_refs, 1); tsk = create_io_thread(io_wq_manager, wq, NUMA_NO_NODE); -- cgit v1.2.3 From 7c30f36a98ae488741178d69662e4f2baa53e7f6 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Sun, 7 Mar 2021 11:54:28 +0100 Subject: io_uring: run __io_sq_thread() with the initial creds from io_uring_setup() With IORING_SETUP_ATTACH_WQ we should let __io_sq_thread() use the initial creds from each ctx. 
Signed-off-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8c74c7799960..4d3333ca27a3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -380,6 +380,7 @@ struct io_ring_ctx { /* Only used for accounting purposes */ struct mm_struct *mm_account; + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ struct io_sq_data *sq_data; /* if using sq thread polling */ struct wait_queue_head sqo_sq_wait; @@ -6719,7 +6720,13 @@ static int io_sq_thread(void *data) sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + const struct cred *creds = NULL; + + if (ctx->sq_creds != current_cred()) + creds = override_creds(ctx->sq_creds); ret = __io_sq_thread(ctx, cap_entries); + if (creds) + revert_creds(creds); if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) sqt_spin = true; } @@ -7152,6 +7159,8 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx) io_put_sq_data(sqd); ctx->sq_data = NULL; + if (ctx->sq_creds) + put_cred(ctx->sq_creds); } } @@ -7890,6 +7899,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, goto err; } + ctx->sq_creds = get_current_cred(); ctx->sq_data = sqd; io_sq_thread_park(sqd); mutex_lock(&sqd->ctx_lock); -- cgit v1.2.3 From 041474885e9707a38fad081abe30159eb6d463f9 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Sun, 7 Mar 2021 11:54:29 +0100 Subject: io_uring: kill io_sq_thread_fork() and return -EOWNERDEAD if the sq_thread is gone This brings the behavior back in line with what 5.11 and earlier did, and this is no longer needed with the improved handling of creds not needing to do unshare(). Signed-off-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4d3333ca27a3..7cf96be691d8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -336,7 +336,6 @@ struct io_ring_ctx { unsigned int drain_next: 1; unsigned int eventfd_async: 1; unsigned int restricted: 1; - unsigned int sqo_exec: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -6786,7 +6785,6 @@ static int io_sq_thread(void *data) sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - ctx->sqo_exec = 1; io_ring_set_wakeup_flag(ctx); } @@ -7846,26 +7844,6 @@ void __io_uring_free(struct task_struct *tsk) tsk->io_uring = NULL; } -static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx) -{ - struct task_struct *tsk; - int ret; - - clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - reinit_completion(&sqd->parked); - ctx->sqo_exec = 0; - sqd->task_pid = current->pid; - tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); - if (IS_ERR(tsk)) - return PTR_ERR(tsk); - ret = io_uring_alloc_task_context(tsk, ctx); - if (ret) - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - sqd->thread = tsk; - wake_up_new_task(tsk); - return ret; -} - static int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { @@ -9199,13 +9177,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (ctx->flags & IORING_SETUP_SQPOLL) { io_cqring_overflow_flush(ctx, false, NULL, NULL); - if (unlikely(ctx->sqo_exec)) { - ret = io_sq_thread_fork(ctx->sq_data, ctx); - if (ret) - goto out; - ctx->sqo_exec = 0; - } ret = -EOWNERDEAD; + if 
(unlikely(ctx->sq_data->thread == NULL)) { + goto out; + } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) { -- cgit v1.2.3 From 9f377622a484de0818c49ee01e0ab4eedf6acd81 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 6 Mar 2021 12:04:38 +0800 Subject: erofs: fix bio->bi_max_vecs behavior change Martin reported an issue that directory read could be hung on the latest -rc kernel with some certain image. The root cause is that commit baa2c7c97153 ("block: set .bi_max_vecs as actual allocated vector number") changes .bi_max_vecs behavior. bio->bi_max_vecs is set as actual allocated vector number rather than the requested number now. Let's avoid using .bi_max_vecs completely instead. Link: https://lore.kernel.org/r/20210306040438.8084-1-hsiangkao@aol.com Reported-by: Martin DEVERA Reviewed-by: Chao Yu [ Gao Xiang: note that <= 5.11 kernels are not impacted. ] Signed-off-by: Gao Xiang --- fs/erofs/data.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/erofs/data.c b/fs/erofs/data.c index f88851c5c250..1249e74b3bf0 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -129,6 +129,7 @@ static inline struct bio *erofs_read_raw_page(struct bio *bio, struct page *page, erofs_off_t *last_block, unsigned int nblocks, + unsigned int *eblks, bool ra) { struct inode *const inode = mapping->host; @@ -145,8 +146,7 @@ static inline struct bio *erofs_read_raw_page(struct bio *bio, /* note that for readpage case, bio also equals to NULL */ if (bio && - /* not continuous */ - *last_block + 1 != current_block) { + (*last_block + 1 != current_block || !*eblks)) { submit_bio_retry: submit_bio(bio); bio = NULL; @@ -216,7 +216,8 @@ submit_bio_retry: if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE)) nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE); - bio = bio_alloc(GFP_NOIO, bio_max_segs(nblocks)); + *eblks = bio_max_segs(nblocks); + bio = bio_alloc(GFP_NOIO, *eblks); bio->bi_end_io = erofs_readendio; bio_set_dev(bio, sb->s_bdev); @@ -229,16 +230,8 @@ submit_bio_retry: /* out of the extent or bio is full */ if (err < PAGE_SIZE) goto submit_bio_retry; - + --*eblks; *last_block = current_block; - - /* shift in advance in case of it followed by too many gaps */ - if (bio->bi_iter.bi_size >= bio->bi_max_vecs * PAGE_SIZE) { - /* err should reassign to 0 after submitting */ - err = 0; - goto submit_bio_out; - } - return bio; err_out: @@ -252,7 +245,6 @@ has_updated: /* if updated manually, continuous pages has a gap */ if (bio) -submit_bio_out: submit_bio(bio); return err ? 
ERR_PTR(err) : NULL; } @@ -264,23 +256,26 @@ submit_bio_out: static int erofs_raw_access_readpage(struct file *file, struct page *page) { erofs_off_t last_block; + unsigned int eblks; struct bio *bio; trace_erofs_readpage(page, true); bio = erofs_read_raw_page(NULL, page->mapping, - page, &last_block, 1, false); + page, &last_block, 1, &eblks, false); if (IS_ERR(bio)) return PTR_ERR(bio); - DBG_BUGON(bio); /* since we have only one bio -- must be NULL */ + if (bio) + submit_bio(bio); return 0; } static void erofs_raw_access_readahead(struct readahead_control *rac) { erofs_off_t last_block; + unsigned int eblks; struct bio *bio = NULL; struct page *page; @@ -291,7 +286,7 @@ static void erofs_raw_access_readahead(struct readahead_control *rac) prefetchw(&page->flags); bio = erofs_read_raw_page(bio, rac->mapping, page, &last_block, - readahead_count(rac), true); + readahead_count(rac), &eblks, true); /* all the page errors are ignored when readahead */ if (IS_ERR(bio)) { @@ -305,7 +300,6 @@ static void erofs_raw_access_readahead(struct readahead_control *rac) put_page(page); } - /* the rare case (end in gaps) */ if (bio) submit_bio(bio); } -- cgit v1.2.3 From d0ed78e1780eb3738f9c106fbaff6a1181017cd3 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 1 Mar 2021 21:02:49 +0100 Subject: s390,alpha: make TMPFS_INODE64 available again Both s390 and alpha have been switched to 64-bit ino_t with commit 96c0a6a72d18 ("s390,alpha: switch to 64-bit ino_t"). Therefore enable TMPFS_INODE64 for both architectures again. Cc: Richard Henderson Cc: Ivan Kokshaysky Link: https://lore.kernel.org/linux-mm/YCV7QiyoweJwvN+m@osiris/ Signed-off-by: Heiko Carstens --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 462253ae483a..a55bda4233bb 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -203,7 +203,7 @@ config TMPFS_XATTR config TMPFS_INODE64 bool "Use 64-bit ino_t by default in tmpfs" - depends on TMPFS && 64BIT && !(S390 || ALPHA) + depends on TMPFS && 64BIT default n help tmpfs has historically used only inode numbers as wide as an unsigned -- cgit v1.2.3 From ee2e3f50629f17b0752b55b2566c15ce8dafb557 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 6 Mar 2021 11:10:10 +0100 Subject: mount: fix mounting of detached mounts onto targets that reside on shared mounts Creating a series of detached mounts, attaching them to the filesystem, and unmounting them can be used to trigger an integer overflow in ns->mounts causing the kernel to block any new mounts in count_mounts() and returning ENOSPC because it falsely assumes that the maximum number of mounts in the mount namespace has been reached, i.e. it thinks it can't fit the new mounts into the mount namespace anymore. 
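[ Editor's note: the check that ends up refusing the mounts, paraphrased from count_mounts() in fs/namespace.c with simplified identifiers; details vary by kernel version: ]

#include <errno.h>

/* ns_mounts/ns_pending mirror ns->mounts and ns->pending_mounts. */
static int count_mounts_sketch(unsigned int ns_mounts, unsigned int ns_pending,
			       unsigned int new_mounts)
{
	unsigned int max = 100000;	/* sysctl fs.mount-max default */
	unsigned int sum = ns_mounts + ns_pending;

	/*
	 * Once ns->mounts has been inflated past the limit by the bug
	 * described below, every new mount fails here with ENOSPC.
	 */
	if (max < sum || new_mounts > max - sum)
		return -ENOSPC;
	return 0;
}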
Depending on the number of mounts in your system, this can be reproduced on any kernel that supports open_tree() and move_mount() by compiling and running the following program:

/* SPDX-License-Identifier: LGPL-2.1+ */

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* open_tree() */
#ifndef OPEN_TREE_CLONE
#define OPEN_TREE_CLONE 1
#endif

#ifndef OPEN_TREE_CLOEXEC
#define OPEN_TREE_CLOEXEC O_CLOEXEC
#endif

#ifndef __NR_open_tree
#if defined __alpha__
#define __NR_open_tree 538
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
#define __NR_open_tree 4428
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
#define __NR_open_tree 6428
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
#define __NR_open_tree 5428
#endif
#elif defined __ia64__
#define __NR_open_tree (428 + 1024)
#else
#define __NR_open_tree 428
#endif
#endif

/* move_mount() */
#ifndef MOVE_MOUNT_F_EMPTY_PATH
#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
#endif

#ifndef __NR_move_mount
#if defined __alpha__
#define __NR_move_mount 539
#elif defined _MIPS_SIM
#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
#define __NR_move_mount 4429
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
#define __NR_move_mount 6429
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
#define __NR_move_mount 5429
#endif
#elif defined __ia64__
#define __NR_move_mount (428 + 1024)
#else
#define __NR_move_mount 429
#endif
#endif

static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
{
	return syscall(__NR_open_tree, dfd, filename, flags);
}

static inline int sys_move_mount(int from_dfd, const char *from_pathname,
				 int to_dfd, const char *to_pathname,
				 unsigned int flags)
{
	return syscall(__NR_move_mount, from_dfd, from_pathname,
		       to_dfd, to_pathname, flags);
}

static bool is_shared_mountpoint(const char *path)
{
	bool shared = false;
	FILE *f = NULL;
	char *line = NULL;
	int i;
	size_t len = 0;

	f = fopen("/proc/self/mountinfo", "re");
	if (!f)
		return 0;

	while (getline(&line, &len, f) > 0) {
		char *slider1, *slider2;

		for (slider1 = line, i = 0; slider1 && i < 4; i++)
			slider1 = strchr(slider1 + 1, ' ');
		if (!slider1)
			continue;

		slider2 = strchr(slider1 + 1, ' ');
		if (!slider2)
			continue;

		*slider2 = '\0';
		if (strcmp(slider1 + 1, path) == 0) {
			/* This is the path. Is it shared? */
			slider1 = strchr(slider2 + 1, ' ');
			if (slider1 && strstr(slider1, "shared:")) {
				shared = true;
				break;
			}
		}
	}
	fclose(f);
	free(line);

	return shared;
}

static void usage(void)
{
	const char *text = "mount-new [--recursive] <base-dir>\n";
	fprintf(stderr, "%s", text);
	_exit(EXIT_SUCCESS);
}

#define exit_usage(format, ...)                              \
	({                                                   \
		fprintf(stderr, format "\n", ##__VA_ARGS__); \
		usage();                                     \
	})

#define exit_log(format, ...)                                \
	({                                                   \
		fprintf(stderr, format "\n", ##__VA_ARGS__); \
		exit(EXIT_FAILURE);                          \
	})

static const struct option longopts[] = {
	{"help", no_argument, 0, 'a'},
	{ NULL, no_argument, 0, 0 },
};

int main(int argc, char *argv[])
{
	int exit_code = EXIT_SUCCESS, index = 0;
	int dfd, fd_tree, new_argc, ret;
	char *base_dir;
	char *const *new_argv;
	char target[PATH_MAX];

	while ((ret = getopt_long_only(argc, argv, "", longopts, &index)) != -1) {
		switch (ret) {
		case 'a':
			/* fallthrough */
		default:
			usage();
		}
	}

	new_argv = &argv[optind];
	new_argc = argc - optind;
	if (new_argc < 1)
		exit_usage("Missing base directory\n");
	base_dir = new_argv[0];

	if (*base_dir != '/')
		exit_log("Please specify an absolute path");

	/* Ensure that target is a shared mountpoint. */
	if (!is_shared_mountpoint(base_dir))
		exit_log("Please ensure that \"%s\" is a shared mountpoint", base_dir);

	dfd = open(base_dir, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
	if (dfd < 0)
		exit_log("%m - Failed to open base directory \"%s\"", base_dir);

	ret = mkdirat(dfd, "detached-move-mount", 0755);
	if (ret < 0)
		exit_log("%m - Failed to create required temporary directories");

	ret = snprintf(target, sizeof(target), "%s/detached-move-mount", base_dir);
	if (ret < 0 || (size_t)ret >= sizeof(target))
		exit_log("%m - Failed to assemble target path");

	/*
	 * Having a mount table with 10000 mounts is already quite excessive
	 * and should account even for weird test systems.
	 */
	for (size_t i = 0; i < 10000; i++) {
		fd_tree = sys_open_tree(dfd, "detached-move-mount",
					OPEN_TREE_CLONE |
					OPEN_TREE_CLOEXEC |
					AT_EMPTY_PATH);
		if (fd_tree < 0) {
			fprintf(stderr, "%m - Failed to open %d(detached-move-mount)", dfd);
			exit_code = EXIT_FAILURE;
			break;
		}

		ret = sys_move_mount(fd_tree, "", dfd, "detached-move-mount",
				     MOVE_MOUNT_F_EMPTY_PATH);
		if (ret < 0) {
			if (errno == ENOSPC)
				fprintf(stderr, "%m - Buggy mount counting");
			else
				fprintf(stderr, "%m - Failed to attach mount to %d(detached-move-mount)", dfd);
			exit_code = EXIT_FAILURE;
			break;
		}
		close(fd_tree);

		ret = umount2(target, MNT_DETACH);
		if (ret < 0) {
			fprintf(stderr, "%m - Failed to unmount %s", target);
			exit_code = EXIT_FAILURE;
			break;
		}
	}

	(void)unlinkat(dfd, "detached-move-mount", AT_REMOVEDIR);
	close(dfd);

	exit(exit_code);
}

and wait for the kernel to refuse any new mounts by returning ENOSPC. How many iterations are needed depends on the number of mounts in your system. Assuming you have something like 50 mounts on a standard system it should be almost instantaneous.

The root cause of this is that detached mounts aren't handled correctly when source and target mount are identical and reside on a shared mount, causing a broken mount tree where the detached source itself is propagated, something the propagation code prevents for regular bind-mounts and new mounts. This ultimately leads to a miscalculation of the number of mounts in the mount namespace.

Detached mounts created via open_tree(fd, path, OPEN_TREE_CLONE) are essentially like an unattached new mount, or an unattached bind-mount. They can then later on be attached to the filesystem via move_mount() which calls into attach_recursive_mount(). Part of attaching them to the filesystem is making sure that mounts get correctly propagated in case the destination mountpoint is MS_SHARED, i.e. is a shared mountpoint. This is done by calling into propagate_mnt() which walks the list of peers calling propagate_one() on each mount in this list, making sure each receives the propagation event. The propagate_one() function thereby skips both new mounts and bind mounts so as not to propagate them "into themselves". Both are identified by checking whether the mount is already attached to any mount namespace in mnt->mnt_ns. This is what the IS_MNT_NEW() helper is responsible for.

However, detached mounts have an anonymous mount namespace attached to them stashed in mnt->mnt_ns, which means that IS_MNT_NEW() doesn't realize they need to be skipped, causing the mount to propagate "into itself". This breaks the mount table and causes a disconnect between the number of mounts recorded as being beneath or reachable from the target mountpoint and the number of mounts actually recorded/counted in ns->mounts, ultimately causing an overflow which in turn prevents any new mounts via the ENOSPC issue.
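[ Editor's note: a rough sketch of where the predicate is consulted, paraphrased from propagate_one() in fs/pnode.c; not the literal kernel body: ]

static int propagate_one(struct mount *m)
{
	/*
	 * Skip mounts that are not (yet) part of a proper mount namespace.
	 * With the fix below, a detached mount's anonymous namespace also
	 * makes IS_MNT_NEW() true, so the detached source is no longer
	 * propagated into itself.
	 */
	if (IS_MNT_NEW(m))
		return 0;
	/* ... clone and hash the propagated copy ... */
	return 0;
}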
So teach propagation to handle detached mounts by making it aware of them. I've been tracking this issue down for the last couple of days and then verifying that the fix is correct by unmounting everything in my current mount table leaving only /proc and /sys mounted and running the reproducer above overnight verifying the number of mounts counted in ns->mounts. With this fix the counts are correct and the ENOSPC issue can't be reproduced. This change will only have an effect on mounts created with the new mount API since detached mounts cannot be created with the old mount API so regressions are extremely unlikely. Link: https://lore.kernel.org/r/20210306101010.243666-1-christian.brauner@ubuntu.com Fixes: 2db154b3ea8e ("vfs: syscall: Add move_mount(2) to move mounts around") Cc: David Howells Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Cc: Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/pnode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pnode.h b/fs/pnode.h index 26f74e092bd9..988f1aa9b02a 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -12,7 +12,7 @@ #define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED) #define IS_MNT_SLAVE(m) ((m)->mnt_master) -#define IS_MNT_NEW(m) (!(m)->mnt_ns) +#define IS_MNT_NEW(m) (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns)) #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) -- cgit v1.2.3 From a0590473c5e6c4ef17c3132ad08fbad170f72d55 Mon Sep 17 00:00:00 2001 From: Timo Rothenpieler Date: Tue, 23 Feb 2021 15:19:01 +0100 Subject: nfs: fix PNFS_FLEXFILE_LAYOUT Kconfig default This follows what was done in 8c2fabc6542d9d0f8b16bd1045c2eda59bdcde13. With the default being m, it's impossible to build the module into the kernel. Signed-off-by: Timo Rothenpieler Signed-off-by: Anna Schumaker --- fs/nfs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index e2a488d403a6..14a72224b657 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -127,7 +127,7 @@ config PNFS_BLOCK config PNFS_FLEXFILE_LAYOUT tristate depends on NFS_V4_1 && NFS_V3 - default m + default NFS_V4 config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN string "NFSv4.1 Implementation ID Domain" -- cgit v1.2.3 From ad3dbe35c833c2d4d0bbf3f04c785d32f931e7c9 Mon Sep 17 00:00:00 2001 From: Frank Sorenson Date: Mon, 8 Mar 2021 12:12:13 -0600 Subject: NFS: Correct size calculation for create reply length CREATE requests return a post_op_fh3, rather than nfs_fh3. The post_op_fh3 includes an extra word to indicate 'handle_follows'. Without that additional word, create fails when full 64-byte filehandles are in use. Add NFS3_post_op_fh_sz, and correct the size calculation for NFS3_createres_sz. 
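[ Editor's note: the word-count arithmetic implied by the macros in the patch, as an illustration; sizes are in 32-bit XDR words: ]

/*
 * NFS3_fhandle_sz    = 1 + 16        length word + up to 64 bytes of handle
 * NFS3_post_op_fh_sz = 1 + 17 = 18   adds the handle_follows discriminant
 *
 * old NFS3_createres_sz: 1 + NFS3_fh_sz (17)         + attrs + wcc
 * new NFS3_createres_sz: 1 + NFS3_post_op_fh_sz (18) + attrs + wcc
 *
 * With a full 64-byte filehandle all 17 handle words are consumed, so a
 * reply budget missing the handle_follows word is one word short.
 */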
Signed-off-by: Frank Sorenson Signed-off-by: Anna Schumaker --- fs/nfs/nfs3xdr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index ca10072644ff..ed1c83738c30 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -36,6 +36,7 @@ #define NFS3_pagepad_sz (1) /* Page padding */ #define NFS3_fhandle_sz (1+16) #define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */ +#define NFS3_post_op_fh_sz (1+NFS3_fh_sz) #define NFS3_sattr_sz (15) #define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) #define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) @@ -73,7 +74,7 @@ #define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+NFS3_pagepad_sz) #define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+NFS3_pagepad_sz) #define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4) -#define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) +#define NFS3_createres_sz (1+NFS3_post_op_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) #define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz)) #define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) #define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+NFS3_pagepad_sz) -- cgit v1.2.3 From 82e7ca1334ab16e2e04fafded1cab9dfcdc11b40 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 8 Mar 2021 14:42:51 -0500 Subject: NFS: Don't revalidate the directory permissions on a lookup failure There should be no reason to expect the directory permissions to change just because the directory contents changed or a negative lookup timed out. So let's avoid doing a full call to nfs_mark_for_revalidate() in that case. Furthermore, if this is a negative dentry, and we haven't actually done a new lookup, then we have no reason yet to believe the directory has changed at all. So let's remove the gratuitous directory inode invalidation altogether when called from nfs_lookup_revalidate_negative(). Reported-by: Geert Jansen Fixes: 5ceb9d7fdaaf ("NFS: Refactor nfs_lookup_revalidate()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 19a9f434442f..08b162de627f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1401,6 +1401,15 @@ out_force: goto out; } +static void nfs_mark_dir_for_revalidate(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&inode->i_lock); + nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; + spin_unlock(&inode->i_lock); +} + /* * We judge how long we want to trust negative * dentries by looking at the parent inode mtime. @@ -1435,7 +1444,6 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, __func__, dentry); return 1; case 0: - nfs_mark_for_revalidate(dir); if (inode && S_ISDIR(inode->i_mode)) { /* Purge readdir caches. */ nfs_zap_caches(inode); @@ -1525,6 +1533,13 @@ out: nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); nfs4_label_free(label); + + /* + * If the lookup failed despite the dentry change attribute being + * a match, then we should revalidate the directory cache. 
+ */ + if (!ret && nfs_verify_change_attribute(dir, dentry->d_time)) + nfs_mark_dir_for_revalidate(dir); return nfs_lookup_revalidate_done(dir, dentry, inode, ret); } @@ -1567,7 +1582,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, error = nfs_lookup_verify_inode(inode, flags); if (error) { if (error == -ESTALE) - nfs_zap_caches(dir); + nfs_mark_dir_for_revalidate(dir); goto out_bad; } nfs_advise_use_readdirplus(dir); @@ -2064,7 +2079,6 @@ out: dput(parent); return d; out_error: - nfs_mark_for_revalidate(dir); d = ERR_PTR(error); goto out; } -- cgit v1.2.3 From 47397915ede0192235474b145ebcd81b37b03624 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 8 Mar 2021 14:42:52 -0500 Subject: NFS: Don't gratuitously clear the inode cache when lookup failed The fact that the lookup revalidation failed, does not mean that the inode contents have changed. Fixes: 5ceb9d7fdaaf ("NFS: Refactor nfs_lookup_revalidate()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 08b162de627f..a91f324cca49 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1444,18 +1444,14 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, __func__, dentry); return 1; case 0: - if (inode && S_ISDIR(inode->i_mode)) { - /* Purge readdir caches. */ - nfs_zap_caches(inode); - /* - * We can't d_drop the root of a disconnected tree: - * its d_hash is on the s_anon list and d_drop() would hide - * it from shrink_dcache_for_unmount(), leading to busy - * inodes on unmount and further oopses. - */ - if (IS_ROOT(dentry)) - return 1; - } + /* + * We can't d_drop the root of a disconnected tree: + * its d_hash is on the s_anon list and d_drop() would hide + * it from shrink_dcache_for_unmount(), leading to busy + * inodes on unmount and further oopses. 
+ */ + if (inode && IS_ROOT(dentry)) + return 1; dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", __func__, dentry); return 0; -- cgit v1.2.3 From fd6d3feed041e96b84680d0bfc1e7abc8f65de92 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 8 Mar 2021 14:42:53 -0500 Subject: NFS: Clean up function nfs_mark_dir_for_revalidate() Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 4 +--- fs/nfs/inode.c | 2 +- fs/nfs/internal.h | 3 ++- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a91f324cca49..02ac982846f4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1403,10 +1403,8 @@ out_force: static void nfs_mark_dir_for_revalidate(struct inode *inode) { - struct nfs_inode *nfsi = NFS_I(inode); - spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; + nfs_set_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE); spin_unlock(&inode->i_lock); } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 749bbea14d99..d21bfaac10b0 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -207,7 +207,7 @@ static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) } #endif -static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) +void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); bool have_delegation = NFS_PROTO(inode)->have_delegation(inode, FMODE_READ); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 25fb43b69e5a..7b644d6c09e4 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -411,7 +411,8 @@ extern int nfs_write_inode(struct inode *, struct writeback_control *); extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); -void nfs_zap_acl_cache(struct inode *inode); +extern void nfs_zap_acl_cache(struct inode *inode); +extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode); -- cgit v1.2.3 From ac46b3d768e4c2754f7b191b81e1bea582e11907 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 8 Mar 2021 14:42:54 -0500 Subject: NFS: Fix open coded versions of nfs_set_cache_invalid() nfs_set_cache_invalid() has code to handle delegations, and other optimisations, so let's use it when appropriate. 
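[ Editor's note: an illustrative contrast, not the literal kernel bodies. The helper consults NFS_PROTO(inode)->have_delegation() and can trim invalidation bits that are still valid under a delegation, which a bare flag OR cannot: ]

/* Open coded: unconditionally applies every bit. */
spin_lock(&inode->i_lock);
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME;
spin_unlock(&inode->i_lock);

/* Helper: may filter the flags first, e.g. while holding a delegation. */
spin_lock(&inode->i_lock);
nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME);
spin_unlock(&inode->i_lock);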
Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 20 ++++++++++---------- fs/nfs/inode.c | 4 ++-- fs/nfs/unlink.c | 6 +++--- fs/nfs/write.c | 8 ++++---- 4 files changed, 19 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 02ac982846f4..fc4f490f2d78 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -81,8 +81,9 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA | - NFS_INO_REVAL_FORCED; + nfs_set_cache_invalid(dir, + NFS_INO_INVALID_DATA | + NFS_INO_REVAL_FORCED); list_add(&ctx->list, &nfsi->open_files); spin_unlock(&dir->i_lock); return ctx; @@ -1700,10 +1701,9 @@ static void nfs_drop_nlink(struct inode *inode) if (inode->i_nlink > 0) drop_nlink(inode); NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_CTIME - | NFS_INO_INVALID_OTHER - | NFS_INO_REVAL_FORCED; + nfs_set_cache_invalid( + inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); } @@ -1715,7 +1715,7 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) { if (S_ISDIR(inode->i_mode)) /* drop any readdir cache as it could easily be old */ - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { nfs_complete_unlink(dentry, inode); @@ -2481,9 +2481,9 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, if (error == 0) { spin_lock(&old_inode->i_lock); NFS_I(old_inode)->attr_gencount = nfs_inc_attr_generation_counter(); - NFS_I(old_inode)->cache_validity |= NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_CTIME - | NFS_INO_REVAL_FORCED; + nfs_set_cache_invalid(old_inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_REVAL_FORCED); spin_unlock(&old_inode->i_lock); } out: diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d21bfaac10b0..eb1ae77f411a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1067,8 +1067,8 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) spin_lock(&inode->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA | - NFS_INO_REVAL_FORCED; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA | + NFS_INO_REVAL_FORCED); list_add_tail_rcu(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index b27ebdccef70..5fa11e1aca4c 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -500,9 +500,9 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); spin_lock(&inode->i_lock); NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_CTIME - | NFS_INO_REVAL_FORCED; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); d_move(dentry, sdentry); break; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 82bdcb982186..f05a90338a76 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -303,9 +303,9 @@ static void nfs_set_pageerror(struct address_space *mapping) 
nfs_zap_mapping(mapping->host, mapping); /* Force file size revalidation */ spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_REVAL_FORCED | - NFS_INO_REVAL_PAGECACHE | - NFS_INO_INVALID_SIZE; + nfs_set_cache_invalid(inode, NFS_INO_REVAL_FORCED | + NFS_INO_REVAL_PAGECACHE | + NFS_INO_INVALID_SIZE); spin_unlock(&inode->i_lock); } @@ -1604,7 +1604,7 @@ static int nfs_writeback_done(struct rpc_task *task, /* Deal with the suid/sgid bit corner case */ if (nfs_should_remove_suid(inode)) { spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); spin_unlock(&inode->i_lock); } return 0; -- cgit v1.2.3 From b6f80a2ebb97f184c4679518ac83074598bf9bf4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 8 Mar 2021 14:42:55 -0500 Subject: NFS: Fix open coded versions of nfs_set_cache_invalid() in NFSv4 nfs_set_cache_invalid() has code to handle delegations, and other optimisations, so let's use it when appropriate. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/inode.c | 1 + fs/nfs/nfs42proc.c | 12 +++++++----- fs/nfs/nfs4proc.c | 28 ++++++++++++---------------- 3 files changed, 20 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index eb1ae77f411a..a7fb076a5f44 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -229,6 +229,7 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) if (flags & NFS_INO_INVALID_DATA) nfs_fscache_invalidate(inode); } +EXPORT_SYMBOL_GPL(nfs_set_cache_invalid); /* * Invalidate the local caches diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index f3fd935620fc..094024b0aca1 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -357,13 +357,15 @@ static ssize_t _nfs42_proc_copy(struct file *src, truncate_pagecache_range(dst_inode, pos_dst, pos_dst + res->write_res.count); spin_lock(&dst_inode->i_lock); - NFS_I(dst_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE | - NFS_INO_REVAL_FORCED | NFS_INO_INVALID_SIZE | - NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA); + nfs_set_cache_invalid( + dst_inode, NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED | + NFS_INO_INVALID_SIZE | NFS_INO_INVALID_ATTR | + NFS_INO_INVALID_DATA); spin_unlock(&dst_inode->i_lock); spin_lock(&src_inode->i_lock); - NFS_I(src_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE | - NFS_INO_REVAL_FORCED | NFS_INO_INVALID_ATIME); + nfs_set_cache_invalid(src_inode, NFS_INO_REVAL_PAGECACHE | + NFS_INO_REVAL_FORCED | + NFS_INO_INVALID_ATIME); spin_unlock(&src_inode->i_lock); status = res->write_res.count; out: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 74bc5120013d..6d8fc56e5f45 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1169,14 +1169,14 @@ int nfs4_call_sync(struct rpc_clnt *clnt, static void nfs4_inc_nlink_locked(struct inode *inode) { - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); inc_nlink(inode); } static void nfs4_dec_nlink_locked(struct inode *inode) { - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); drop_nlink(inode); } @@ -1187,35 +1187,31 @@ nfs4_update_changeattr_locked(struct inode *inode, { struct nfs_inode *nfsi = NFS_I(inode); - nfsi->cache_validity |= NFS_INO_INVALID_CTIME - | NFS_INO_INVALID_MTIME - | cache_validity; + cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; if (cinfo->atomic && cinfo->before == 
inode_peek_iversion_raw(inode)) { nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; nfsi->attrtimeo_timestamp = jiffies; } else { if (S_ISDIR(inode->i_mode)) { - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + cache_validity |= NFS_INO_INVALID_DATA; nfs_force_lookup_revalidate(inode); } else { if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) - nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; + cache_validity |= NFS_INO_REVAL_PAGECACHE; } if (cinfo->before != inode_peek_iversion_raw(inode)) - nfsi->cache_validity |= NFS_INO_INVALID_ACCESS | - NFS_INO_INVALID_ACL | - NFS_INO_INVALID_XATTR; + cache_validity |= NFS_INO_INVALID_ACCESS | + NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR; } inode_set_iversion_raw(inode, cinfo->after); nfsi->read_cache_jiffies = timestamp; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); + nfs_set_cache_invalid(inode, cache_validity); nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE; - - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); } void @@ -5915,9 +5911,9 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl * so mark the attribute cache invalid. */ spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_CTIME - | NFS_INO_REVAL_FORCED; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); -- cgit v1.2.3 From bf1bc694b6b0cf49756cb06f8f38501b9b2c5527 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 8 Mar 2021 12:00:47 -0300 Subject: cifs: print MIDs in decimal notation The MIDs are mostly printed as decimal, so let's make it consistent. Signed-off-by: Paulo Alcantara (SUSE) Reviewed-by: Aurelien Aptel Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 2 +- fs/cifs/connect.c | 4 ++-- fs/cifs/smb2misc.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 3aedc484e440..88a7958170ee 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -207,7 +207,7 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) from_kuid(&init_user_ns, cfile->uid), cfile->dentry); #ifdef CONFIG_CIFS_DEBUG2 - seq_printf(m, " 0x%llx\n", cfile->fid.mid); + seq_printf(m, " %llu\n", cfile->fid.mid); #else seq_printf(m, "\n"); #endif /* CIFS_DEBUG2 */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 68642e3d4270..eec8a2052da2 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -741,7 +741,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) spin_lock(&GlobalMid_Lock); list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid); + cifs_dbg(FYI, "Clearing mid %llu\n", mid_entry->mid); kref_get(&mid_entry->refcount); mid_entry->mid_state = MID_SHUTDOWN; list_move(&mid_entry->qhead, &dispose_list); @@ -752,7 +752,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) /* now walk dispose list and issue callbacks */ list_for_each_safe(tmp, tmp2, &dispose_list) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid); + cifs_dbg(FYI, "Callback mid %llu\n", mid_entry->mid); list_del_init(&mid_entry->qhead); mid_entry->callback(mid_entry); cifs_mid_q_entry_release(mid_entry); diff 
--git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 60d4bd1eae2b..6e0ea19e710b 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -767,7 +767,7 @@ smb2_cancelled_close_fid(struct work_struct *work) int rc; if (cancelled->mid) - cifs_tcon_dbg(VFS, "Close unmatched open for MID:%llx\n", + cifs_tcon_dbg(VFS, "Close unmatched open for MID:%llu\n", cancelled->mid); else cifs_tcon_dbg(VFS, "Close interrupted close\n"); -- cgit v1.2.3 From e3d100eae44b42f309c1366efb8397368f1cf8ed Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 8 Mar 2021 12:00:48 -0300 Subject: cifs: change noisy error message to FYI A customer has reported that their dmesg was being flooded by

  CIFS: VFS: \\server Cancelling wait for mid xxx cmd: a
  CIFS: VFS: \\server Cancelling wait for mid yyy cmd: b
  CIFS: VFS: \\server Cancelling wait for mid zzz cmd: c

because some processes that were performing statfs(2) on the share had been interrupted due to their automount setup when certain users logged in and out. Change these messages to FYI as they are mostly informative rather than error messages. Signed-off-by: Paulo Alcantara (SUSE) Reviewed-by: Aurelien Aptel Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index e90a1d1380b0..15b72d2e8713 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -1207,7 +1207,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } if (rc != 0) { for (; i < num_rqst; i++) { - cifs_server_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n", + cifs_server_dbg(FYI, "Cancelling wait for mid %llu cmd: %d\n", midQ[i]->mid, le16_to_cpu(midQ[i]->command)); send_cancel(server, &rqst[i], midQ[i]); spin_lock(&GlobalMid_Lock); -- cgit v1.2.3 From 14302ee3301b3a77b331cc14efb95bf7184c73cc Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 8 Mar 2021 12:00:49 -0300 Subject: cifs: return proper error code in statfs(2) In cifs_statfs(), if server->ops->queryfs is not NULL, then we should use its return value rather than always returning 0. Instead, use the rc variable as it is properly set to 0 in case there is no server->ops->queryfs. Signed-off-by: Paulo Alcantara (SUSE) Reviewed-by: Aurelien Aptel Reviewed-by: Ronnie Sahlberg CC: Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index d43e935d2df4..099ad9f3660b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -290,7 +290,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) rc = server->ops->queryfs(xid, tcon, cifs_sb, buf); free_xid(xid); - return 0; + return rc; } static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len) -- cgit v1.2.3 From 04ad69c342fc4de5bd23be9ef15ea7574fb1a87e Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 8 Mar 2021 12:00:50 -0300 Subject: cifs: do not send close in compound create+close requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case of interrupted syscalls, prevent sending CLOSE commands for compound CREATE+CLOSE requests by introducing a CIFS_CP_CREATE_CLOSE_OP flag to indicate to lower layers that they should not send a CLOSE command for the MIDs corresponding to the compound CREATE+CLOSE request.
A simple reproducer:

#!/bin/bash

mount //server/share /mnt -o username=foo,password=***
tc qdisc add dev eth0 root netem delay 450ms
stat -f /mnt &>/dev/null &
pid=$!
sleep 0.01
kill $pid
tc qdisc del dev eth0 root
umount /mnt

Before patch:

...
6 0.256893470 192.168.122.2 → 192.168.122.15 SMB2 402 Create Request File: ;GetInfo Request FS_INFO/FileFsFullSizeInformation;Close Request
7 0.257144491 192.168.122.15 → 192.168.122.2 SMB2 498 Create Response File: ;GetInfo Response;Close Response
9 0.260798209 192.168.122.2 → 192.168.122.15 SMB2 146 Close Request File:
10 0.260841089 192.168.122.15 → 192.168.122.2 SMB2 130 Close Response, Error: STATUS_FILE_CLOSED

Signed-off-by: Paulo Alcantara (SUSE) Reviewed-by: Ronnie Sahlberg Reviewed-by: Aurelien Aptel CC: Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 19 ++++++++++--------- fs/cifs/smb2inode.c | 1 + fs/cifs/smb2misc.c | 8 ++++---- fs/cifs/smb2ops.c | 10 +++++----- fs/cifs/smb2proto.h | 3 +-- fs/cifs/transport.c | 2 +- 6 files changed, 22 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 3de3c5908a72..31fc8695abd6 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -257,7 +257,7 @@ struct smb_version_operations { /* verify the message */ int (*check_message)(char *, unsigned int, struct TCP_Server_Info *); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); - int (*handle_cancelled_mid)(char *, struct TCP_Server_Info *); + int (*handle_cancelled_mid)(struct mid_q_entry *, struct TCP_Server_Info *); void (*downgrade_oplock)(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache); @@ -1705,16 +1705,17 @@ static inline bool is_retryable_error(int error) #define CIFS_NO_RSP_BUF 0x040 /* no response buffer required */ /* Type of request operation */ -#define CIFS_ECHO_OP 0x080 /* echo request */ -#define CIFS_OBREAK_OP 0x0100 /* oplock break request */ -#define CIFS_NEG_OP 0x0200 /* negotiate request */ +#define CIFS_ECHO_OP 0x080 /* echo request */ +#define CIFS_OBREAK_OP 0x0100 /* oplock break request */ +#define CIFS_NEG_OP 0x0200 /* negotiate request */ +#define CIFS_CP_CREATE_CLOSE_OP 0x0400 /* compound create+close request */ /* Lower bitmask values are reserved by others below.
*/ -#define CIFS_SESS_OP 0x2000 /* session setup request */ -#define CIFS_OP_MASK 0x2380 /* mask request type */ +#define CIFS_SESS_OP 0x2000 /* session setup request */ +#define CIFS_OP_MASK 0x2780 /* mask request type */ -#define CIFS_HAS_CREDITS 0x0400 /* already has credits */ -#define CIFS_TRANSFORM_REQ 0x0800 /* transform request before sending */ -#define CIFS_NO_SRV_RSP 0x1000 /* there is no server response */ +#define CIFS_HAS_CREDITS 0x0400 /* already has credits */ +#define CIFS_TRANSFORM_REQ 0x0800 /* transform request before sending */ +#define CIFS_NO_SRV_RSP 0x1000 /* there is no server response */ /* Security Flags: indicate type of session setup needed */ #define CIFSSEC_MAY_SIGN 0x00001 diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 1f900b81c34a..a718dc77e604 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -358,6 +358,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, if (cfile) goto after_close; /* Close */ + flags |= CIFS_CP_CREATE_CLOSE_OP; rqst[num_rqst].rq_iov = &vars->close_iov[0]; rqst[num_rqst].rq_nvec = 1; rc = SMB2_close_init(tcon, server, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 6e0ea19e710b..b50164e2c88d 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -844,14 +844,14 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid, } int -smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server) +smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *sync_hdr = (struct smb2_sync_hdr *)buffer; - struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer; + struct smb2_sync_hdr *sync_hdr = mid->resp_buf; + struct smb2_create_rsp *rsp = mid->resp_buf; struct cifs_tcon *tcon; int rc; - if (sync_hdr->Command != SMB2_CREATE || + if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || sync_hdr->Command != SMB2_CREATE || sync_hdr->Status != STATUS_SUCCESS) return 0; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f5087295424c..9bae7e8deb09 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1195,7 +1195,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, struct TCP_Server_Info *server = cifs_pick_channel(ses); __le16 *utf16_path = NULL; int ea_name_len = strlen(ea_name); - int flags = 0; + int flags = CIFS_CP_CREATE_CLOSE_OP; int len; struct smb_rqst rqst[3]; int resp_buftype[3]; @@ -1573,7 +1573,7 @@ smb2_ioctl_query_info(const unsigned int xid, struct smb_query_info qi; struct smb_query_info __user *pqi; int rc = 0; - int flags = 0; + int flags = CIFS_CP_CREATE_CLOSE_OP; struct smb2_query_info_rsp *qi_rsp = NULL; struct smb2_ioctl_rsp *io_rsp = NULL; void *buffer = NULL; @@ -2577,7 +2577,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, { struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = cifs_pick_channel(ses); - int flags = 0; + int flags = CIFS_CP_CREATE_CLOSE_OP; struct smb_rqst rqst[3]; int resp_buftype[3]; struct kvec rsp_iov[3]; @@ -2975,7 +2975,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, unsigned int sub_offset; unsigned int print_len; unsigned int print_offset; - int flags = 0; + int flags = CIFS_CP_CREATE_CLOSE_OP; struct smb_rqst rqst[3]; int resp_buftype[3]; struct kvec rsp_iov[3]; @@ -3157,7 +3157,7 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); - int flags = 0; + 
int flags = CIFS_CP_CREATE_CLOSE_OP; struct smb_rqst rqst[3]; int resp_buftype[3]; struct kvec rsp_iov[3]; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 9565e27681a5..a2eb34a8d9c9 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -246,8 +246,7 @@ extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, extern int smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid, __u64 volatile_fid); -extern int smb2_handle_cancelled_mid(char *buffer, - struct TCP_Server_Info *server); +extern int smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server); void smb2_cancelled_close_fid(struct work_struct *work); extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 15b72d2e8713..007d99437c77 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -101,7 +101,7 @@ static void _cifs_mid_q_entry_release(struct kref *refcount) if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) && midEntry->mid_state == MID_RESPONSE_RECEIVED && server->ops->handle_cancelled_mid) - server->ops->handle_cancelled_mid(midEntry->resp_buf, server); + server->ops->handle_cancelled_mid(midEntry, server); midEntry->mid_state = MID_FREE; atomic_dec(&midCount); -- cgit v1.2.3 From 4aa5e002034f0701c3335379fd6c22d7f3338cce Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 8 Mar 2021 10:51:51 -0500 Subject: Revert "nfsd4: remove check_conflicting_opens warning" This reverts commit 50747dd5e47b "nfsd4: remove check_conflicting_opens warning", as a prerequisite for reverting 94415b06eb8a, which has a serious bug. Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 61552e89bd89..482e241c9639 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4952,6 +4952,7 @@ static int nfsd4_check_conflicting_opens(struct nfs4_client *clp, writes--; if (fp->fi_fds[O_RDWR]) writes--; + WARN_ON_ONCE(writes < 0); if (writes > 0) return -EAGAIN; spin_lock(&fp->fi_lock); -- cgit v1.2.3 From 6ee65a773096ab3f39d9b00311ac983be5bdeb7c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 8 Mar 2021 10:52:29 -0500 Subject: Revert "nfsd4: a client's own opens needn't prevent delegations" This reverts commit 94415b06eb8aed13481646026dc995f04a3a534a. That commit claimed to allow a client to get a read delegation when it was the only writer. Actually it allowed a client to get a read delegation when *any* client has a write open! The main problem is that it's depending on nfs4_clnt_odstate structures that are actually only maintained for pnfs exports. This causes clients to miss writes performed by other clients, even when there have been intervening closes and opens, violating close-to-open cache consistency. We can do this a different way, but first we should just revert this. I've added pynfs 4.1 test DELEG19 to test for this, as I should have done originally! Cc: stable@vger.kernel.org Reported-by: Timo Rothenpieler Signed-off-by: J. 
Bruce Fields Signed-off-by: Chuck Lever --- fs/locks.c | 3 --- fs/nfsd/nfs4state.c | 54 ++++++++++++++--------------------------------------- 2 files changed, 14 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 99ca97e81b7a..6125d2de39b8 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1808,9 +1808,6 @@ check_conflicting_open(struct file *filp, const long arg, int flags) if (flags & FL_LAYOUT) return 0; - if (flags & FL_DELEG) - /* We leave these checks to the caller. */ - return 0; if (arg == F_RDLCK) return inode_is_open_for_write(inode) ? -EAGAIN : 0; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 482e241c9639..97447a64bad0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4940,32 +4940,6 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, return fl; } -static int nfsd4_check_conflicting_opens(struct nfs4_client *clp, - struct nfs4_file *fp) -{ - struct nfs4_clnt_odstate *co; - struct file *f = fp->fi_deleg_file->nf_file; - struct inode *ino = locks_inode(f); - int writes = atomic_read(&ino->i_writecount); - - if (fp->fi_fds[O_WRONLY]) - writes--; - if (fp->fi_fds[O_RDWR]) - writes--; - WARN_ON_ONCE(writes < 0); - if (writes > 0) - return -EAGAIN; - spin_lock(&fp->fi_lock); - list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) { - if (co->co_client != clp) { - spin_unlock(&fp->fi_lock); - return -EAGAIN; - } - } - spin_unlock(&fp->fi_lock); - return 0; -} - static struct nfs4_delegation * nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate) @@ -4985,12 +4959,9 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, nf = find_readable_file(fp); if (!nf) { - /* - * We probably could attempt another open and get a read - * delegation, but for now, don't bother until the - * client actually sends us one. - */ - return ERR_PTR(-EAGAIN); + /* We should always have a readable file here */ + WARN_ON_ONCE(1); + return ERR_PTR(-EBADF); } spin_lock(&state_lock); spin_lock(&fp->fi_lock); @@ -5020,19 +4991,11 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (!fl) goto out_clnt_odstate; - status = nfsd4_check_conflicting_opens(clp, fp); - if (status) { - locks_free_lock(fl); - goto out_clnt_odstate; - } status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL); if (fl) locks_free_lock(fl); if (status) goto out_clnt_odstate; - status = nfsd4_check_conflicting_opens(clp, fp); - if (status) - goto out_clnt_odstate; spin_lock(&state_lock); spin_lock(&fp->fi_lock); @@ -5114,6 +5077,17 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out_no_deleg; + /* + * Also, if the file was opened for write or + * create, there's a good chance the client's + * about to write to it, resulting in an + * immediate recall (since we don't support + * write delegations): + */ + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + goto out_no_deleg; + if (open->op_create == NFS4_OPEN_CREATE) + goto out_no_deleg; break; default: goto out_no_deleg; -- cgit v1.2.3 From 5808fecc572391867fcd929662b29c12e6d08d81 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Tue, 9 Mar 2021 09:29:11 -0800 Subject: iomap: Fix negative assignment to unsigned sis->pages in iomap_swapfile_activate In case if isi.nr_pages is 0, we are making sis->pages (which is unsigned int) a huge value in iomap_swapfile_activate() by assigning -1. 
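[ Editor's note: spelling out the overflow, assuming a 32-bit unsigned int: ]

unsigned int pages;			/* sis->pages */
unsigned long long nr_pages = 0;	/* isi.nr_pages of an unusable swapfile */

pages = nr_pages - 1;			/* sis->pages = isi.nr_pages - 1; */
/*
 * pages is now UINT_MAX (4294967295): the kernel believes the swapfile
 * holds ~4 billion pages even though isi.nr_extents is 0, which is where
 * the bogus sizes in the swapon/panic log below come from.
 */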
This could cause a kernel crash in kernel v4.18 (with below signature). Or could lead to unknown issues on latest kernel if the fake big swap gets used. Fix this issue by returning -EINVAL in case of nr_pages is 0, since it is anyway a invalid swapfile. Looks like this issue will be hit when we have pagesize < blocksize type of configuration. I was able to hit the issue in case of a tiny swap file with below test script. https://raw.githubusercontent.com/riteshharjani/LinuxStudy/master/scripts/swap-issue.sh kernel crash analysis on v4.18 ============================== On v4.18 kernel, it causes a kernel panic, since sis->pages becomes a huge value and isi.nr_extents is 0. When 0 is returned it is considered as a swapfile over NFS and SWP_FILE is set (sis->flags |= SWP_FILE). Then when swapoff was getting called it was calling a_ops->swap_deactivate() if (sis->flags & SWP_FILE) is true. Since a_ops->swap_deactivate() is NULL in case of XFS, it causes below panic. Panic signature on v4.18 kernel: ======================================= root@qemu:/home/qemu# [ 8291.723351] XFS (loop2): Unmounting Filesystem [ 8292.123104] XFS (loop2): Mounting V5 Filesystem [ 8292.132451] XFS (loop2): Ending clean mount [ 8292.263362] Adding 4294967232k swap on /mnt1/test/swapfile. Priority:-2 extents:1 across:274877906880k [ 8292.277834] Unable to handle kernel paging request for instruction fetch [ 8292.278677] Faulting instruction address: 0x00000000 cpu 0x19: Vector: 400 (Instruction Access) at [c0000009dd5b7ad0] pc: 0000000000000000 lr: c0000000003eb9dc: destroy_swap_extents+0xfc/0x120 sp: c0000009dd5b7d50 msr: 8000000040009033 current = 0xc0000009b6710080 paca = 0xc00000003ffcb280 irqmask: 0x03 irq_happened: 0x01 pid = 5604, comm = swapoff Linux version 4.18.0 (riteshh@xxxxxxx) (gcc version 8.4.0 (Ubuntu 8.4.0-1ubuntu1~18.04)) #57 SMP Wed Mar 3 01:33:04 CST 2021 enter ? for help [link register ] c0000000003eb9dc destroy_swap_extents+0xfc/0x120 [c0000009dd5b7d50] c0000000025a7058 proc_poll_event+0x0/0x4 (unreliable) [c0000009dd5b7da0] c0000000003f0498 sys_swapoff+0x3f8/0x910 [c0000009dd5b7e30] c00000000000bbe4 system_call+0x5c/0x70 Exception: c01 (System Call) at 00007ffff7d208d8 Signed-off-by: Ritesh Harjani [djwong: rework the comment to provide more details] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/iomap/swapfile.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index a648dbf6991e..a5e478de1417 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -170,6 +170,16 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, return ret; } + /* + * If this swapfile doesn't contain even a single page-aligned + * contiguous range of blocks, reject this useless swapfile to + * prevent confusion later on. + */ + if (isi.nr_pages == 0) { + pr_warn("swapon: Cannot find a single usable page in file.\n"); + return -EINVAL; + } + *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; sis->max = isi.nr_pages; sis->pages = isi.nr_pages - 1; -- cgit v1.2.3 From b5a08423da9da59c7f38ed8dbb6dd6cbbe9024a4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Mar 2021 09:32:52 -0800 Subject: xfs: fix quota accounting when a mount is idmapped Nowadays, we indirectly use the idmap-aware helper functions in the VFS to set the initial uid and gid of a file being created. 
Unfortunately, we didn't convert the quota code, which means we attach the wrong dquots to files created on an idmapped mount. Fixes: f736d93d76d3 ("xfs: support idmapped mounts") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Acked-by: Christian Brauner Reviewed-by: Dave Chinner --- fs/xfs/xfs_inode.c | 14 ++++++++------ fs/xfs/xfs_symlink.c | 3 ++- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 46a861d55e48..f93370bd7b1e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1007,9 +1007,10 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, - &udqp, &gdqp, &pdqp); + error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns), + fsgid_into_mnt(mnt_userns), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); if (error) return error; @@ -1157,9 +1158,10 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, - &udqp, &gdqp, &pdqp); + error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns), + fsgid_into_mnt(mnt_userns), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); if (error) return error; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 1379013d74b8..7f368b10ded1 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -182,7 +182,8 @@ xfs_symlink( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, + error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns), + fsgid_into_mnt(mnt_userns), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) -- cgit v1.2.3 From 614c9750173e412663728215152cc6d12bcb3425 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 9 Mar 2021 09:41:14 -0500 Subject: NFSD: fix dest to src mount in inter-server COPY A cleanup of the inter SSC copy needs to call fput() of the source file handle to make sure that file structure is freed as well as drop the reference on the superblock to unmount the source server. Fixes: 36e1e5ba90fb ("NFSD: Fix use-after-free warning when doing inter-server copy") Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever Tested-by: Dai Ngo --- fs/nfsd/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index acdb3cd806a1..dd9f38d072dd 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1302,7 +1302,7 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src, struct nfsd_file *dst) { nfs42_ssc_close(src->nf_file); - /* 'src' is freed by nfsd4_do_async_copy */ + fput(src->nf_file); nfsd_file_put(dst); mntput(ss_mnt); } -- cgit v1.2.3 From 53cb245454df5b13d7063162afd7a785aed6ebf2 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 15 Jan 2021 18:43:56 +0100 Subject: NFSv4.2: fix return value of _nfs4_get_security_label() An xattr 'get' handler is expected to return the length of the value on success, yet _nfs4_get_security_label() (and consequently also nfs4_xattr_get_nfs4_label(), which is used as an xattr handler) returns just 0 on success. Fix this by returning label.len instead, which contains the length of the result. 
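[ Editor's note: the handler contract being fixed against, shown with a hypothetical handler; the size-probe/copy convention is the standard xattr one: ]

static int example_xattr_get(const struct xattr_handler *handler,
			     struct dentry *unused, struct inode *inode,
			     const char *name, void *buffer, size_t size)
{
	size_t len = 0;	/* length of the attribute value, filled elsewhere */

	if (!buffer)
		return len;	/* size probe: report the required length */
	if (size < len)
		return -ERANGE;
	/* ... copy the value into buffer ... */
	return len;		/* success: number of bytes, never just 0 */
}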
Fixes: aa9c2669626c ("NFS: Client implementation of Labeled-NFS") Signed-off-by: Ondrej Mosnacek Reviewed-by: James Morris Reviewed-by: Paul Moore Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6d8fc56e5f45..1002c4f66f3f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5965,7 +5965,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, return ret; if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) return -ENOENT; - return 0; + return label.len; } static int nfs4_get_security_label(struct inode *inode, void *buf, -- cgit v1.2.3 From 05962f95f9ac7af25fea037ef51b37c0eccb5590 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 6 Mar 2021 13:58:48 -0700 Subject: io_uring: SQPOLL parking fixes We keep running into weird dependency issues between the sqd lock and the parking state. Disentangle the SQPOLL thread from the last bits of the kthread parking inheritance, and just replace the parking state, and two associated locks, with a single rw mutex. The SQPOLL thread keeps the mutex for read all the time, except if someone has marked us needing to park. Then we drop/re-acquire and try again. This greatly simplifies the parking state machine (by just getting rid of it), and makes it a lot more obvious how it works - if you need to modify the ctx list, then you simply park the thread which will grab the lock for writing. Fold in fix from Hillf Danton on not setting STOP on a fatal signal. Fixes: e54945ae947f ("io_uring: SQPOLL stop error handling fixes") Signed-off-by: Jens Axboe --- fs/io_uring.c | 133 +++++++++++++++------------------------------------------- 1 file changed, 34 insertions(+), 99 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7cf96be691d8..2a3542b487ff 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -258,12 +258,11 @@ enum { struct io_sq_data { refcount_t refs; - struct mutex lock; + struct rw_semaphore rw_lock; /* ctx's that are using this sqd */ struct list_head ctx_list; struct list_head ctx_new_list; - struct mutex ctx_lock; struct task_struct *thread; struct wait_queue_head wait; @@ -274,7 +273,6 @@ struct io_sq_data { unsigned long state; struct completion startup; - struct completion parked; struct completion exited; }; @@ -6638,45 +6636,6 @@ static void io_sqd_init_new(struct io_sq_data *sqd) io_sqd_update_thread_idle(sqd); } -static bool io_sq_thread_should_stop(struct io_sq_data *sqd) -{ - return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); -} - -static bool io_sq_thread_should_park(struct io_sq_data *sqd) -{ - return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); -} - -static void io_sq_thread_parkme(struct io_sq_data *sqd) -{ - for (;;) { - /* - * TASK_PARKED is a special state; we must serialize against - * possible pending wakeups to avoid store-store collisions on - * task->state. - * - * Such a collision might possibly result in the task state - * changin from TASK_PARKED and us failing the - * wait_task_inactive() in kthread_park(). - */ - set_special_state(TASK_PARKED); - if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) - break; - - /* - * Thread is going to call schedule(), do not preempt it, - * or the caller of kthread_park() may spend more time in - * wait_task_inactive(). 
- */ - preempt_disable(); - complete(&sqd->parked); - schedule_preempt_disabled(); - preempt_enable(); - } - __set_current_state(TASK_RUNNING); -} - static int io_sq_thread(void *data) { struct io_sq_data *sqd = data; @@ -6697,17 +6656,16 @@ static int io_sq_thread(void *data) wait_for_completion(&sqd->startup); - while (!io_sq_thread_should_stop(sqd)) { + down_read(&sqd->rw_lock); + + while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) { int ret; bool cap_entries, sqt_spin, needs_sched; - /* - * Any changes to the sqd lists are synchronized through the - * thread parking. This synchronizes the thread vs users, - * the users are synchronized on the sqd->ctx_lock. - */ - if (io_sq_thread_should_park(sqd)) { - io_sq_thread_parkme(sqd); + if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { + up_read(&sqd->rw_lock); + cond_resched(); + down_read(&sqd->rw_lock); continue; } if (unlikely(!list_empty(&sqd->ctx_new_list))) { @@ -6752,12 +6710,14 @@ static int io_sq_thread(void *data) } } - if (needs_sched && !io_sq_thread_should_park(sqd)) { + if (needs_sched && !test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); + up_read(&sqd->rw_lock); schedule(); try_to_freeze(); + down_read(&sqd->rw_lock); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); } @@ -6768,28 +6728,16 @@ static int io_sq_thread(void *data) list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_uring_cancel_sqpoll(ctx); + up_read(&sqd->rw_lock); io_run_task_work(); - /* - * Ensure that we park properly if racing with someone trying to park - * while we're exiting. If we fail to grab the lock, check park and - * park if necessary. The ordering with the park bit and the lock - * ensures that we catch this reliably. 
- */ - if (!mutex_trylock(&sqd->lock)) { - if (io_sq_thread_should_park(sqd)) - io_sq_thread_parkme(sqd); - mutex_lock(&sqd->lock); - } - + down_write(&sqd->rw_lock); sqd->thread = NULL; - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); - } - + up_write(&sqd->rw_lock); complete(&sqd->exited); - mutex_unlock(&sqd->lock); do_exit(0); } @@ -7088,44 +7036,40 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) } static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->lock) + __releases(&sqd->rw_lock) { if (sqd->thread == current) return; clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - if (sqd->thread) - wake_up_state(sqd->thread, TASK_PARKED); - mutex_unlock(&sqd->lock); + up_write(&sqd->rw_lock); } static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->lock) + __acquires(&sqd->rw_lock) { if (sqd->thread == current) return; set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - mutex_lock(&sqd->lock); - if (sqd->thread) { + down_write(&sqd->rw_lock); + /* set again for consistency, in case concurrent parks are happening */ + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + if (sqd->thread) wake_up_process(sqd->thread); - wait_for_completion(&sqd->parked); - } } static void io_sq_thread_stop(struct io_sq_data *sqd) { if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) return; - mutex_lock(&sqd->lock); - if (sqd->thread) { - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)); - wake_up_process(sqd->thread); - mutex_unlock(&sqd->lock); - wait_for_completion(&sqd->exited); - WARN_ON_ONCE(sqd->thread); - } else { - mutex_unlock(&sqd->lock); + down_write(&sqd->rw_lock); + if (!sqd->thread) { + up_write(&sqd->rw_lock); + return; } + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + wake_up_process(sqd->thread); + up_write(&sqd->rw_lock); + wait_for_completion(&sqd->exited); } static void io_put_sq_data(struct io_sq_data *sqd) @@ -7142,18 +7086,13 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx) if (sqd) { complete(&sqd->startup); - if (sqd->thread) { + if (sqd->thread) wait_for_completion(&ctx->sq_thread_comp); - io_sq_thread_park(sqd); - } - mutex_lock(&sqd->ctx_lock); + io_sq_thread_park(sqd); list_del(&ctx->sqd_list); io_sqd_update_thread_idle(sqd); - mutex_unlock(&sqd->ctx_lock); - - if (sqd->thread) - io_sq_thread_unpark(sqd); + io_sq_thread_unpark(sqd); io_put_sq_data(sqd); ctx->sq_data = NULL; @@ -7202,11 +7141,9 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) refcount_set(&sqd->refs, 1); INIT_LIST_HEAD(&sqd->ctx_list); INIT_LIST_HEAD(&sqd->ctx_new_list); - mutex_init(&sqd->ctx_lock); - mutex_init(&sqd->lock); + init_rwsem(&sqd->rw_lock); init_waitqueue_head(&sqd->wait); init_completion(&sqd->startup); - init_completion(&sqd->parked); init_completion(&sqd->exited); return sqd; } @@ -7880,9 +7817,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, ctx->sq_creds = get_current_cred(); ctx->sq_data = sqd; io_sq_thread_park(sqd); - mutex_lock(&sqd->ctx_lock); list_add(&ctx->sqd_list, &sqd->ctx_new_list); - mutex_unlock(&sqd->ctx_lock); io_sq_thread_unpark(sqd); ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); -- cgit v1.2.3 From f458dd8441e56d122ddf1d8e2af0b6ee62f52af9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Mar 2021 12:14:14 +0000 Subject: io_uring: fix unrelated ctx reqs cancellation io-wq now is per-task, so cancellations now 
should match against request's ctx.

Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2a3542b487ff..d4f018f5838d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5573,22 +5573,30 @@ add:
 	return 0;
 }

+struct io_cancel_data {
+	struct io_ring_ctx *ctx;
+	u64 user_data;
+};
+
 static bool io_cancel_cb(struct io_wq_work *work, void *data)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_cancel_data *cd = data;

-	return req->user_data == (unsigned long) data;
+	return req->ctx == cd->ctx && req->user_data == cd->user_data;
 }

-static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr)
+static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
+			       struct io_ring_ctx *ctx)
 {
+	struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
 	enum io_wq_cancel cancel_ret;
 	int ret = 0;

-	if (!tctx->io_wq)
+	if (!tctx || !tctx->io_wq)
 		return -ENOENT;

-	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false);
+	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
 	switch (cancel_ret) {
 	case IO_WQ_CANCEL_OK:
 		ret = 0;
@@ -5611,8 +5619,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
 	unsigned long flags;
 	int ret;

-	ret = io_async_cancel_one(req->task->io_uring,
-				  (void *) (unsigned long) sqe_addr);
+	ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
 	if (ret != -ENOENT) {
 		spin_lock_irqsave(&ctx->completion_lock, flags);
 		goto done;
--
cgit v1.2.3


From 0298ef969a110ca03654f0cea9b50e3f3b331acc Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 8 Mar 2021 13:20:57 +0000
Subject: io_uring: clean R_DISABLED startup mess

There are plenty of problems with IORING_SETUP_R_DISABLED, including
the burden of checking and kicking off the SQO task all over the
codebase -- for exit/cancel/etc.

Rework it: always start the thread, but don't submit unless the flag
is gone. That's much easier.
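As a rough sketch of the resulting pattern (hypothetical names, not the
actual io_uring code), the thread always runs and the flag only gates
submission:

	static int sq_thread(void *data)
	{
		struct ring *ring = data;

		while (!test_bit(RING_SHOULD_STOP, &ring->state)) {
			/* the thread runs regardless; only submission is gated */
			if (!(ring->flags & RING_R_DISABLED))
				submit_pending(ring);
			wait_for_work_or_timeout(ring);
		}
		return 0;
	}

Enabling the ring then reduces to clearing the flag and waking the
thread, which is what the io_register_enable_rings() hunk below does.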
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d4f018f5838d..3f6db813d670 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6606,7 +6606,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 		if (!list_empty(&ctx->iopoll_list))
 			io_do_iopoll(ctx, &nr_events, 0);

-		if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)))
+		if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
+		    !(ctx->flags & IORING_SETUP_R_DISABLED))
 			ret = io_submit_sqes(ctx, to_submit);
 		mutex_unlock(&ctx->uring_lock);
 	}
@@ -7861,6 +7862,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 		wake_up_new_task(tsk);
 		if (ret)
 			goto err;
+		complete(&sqd->startup);
 	} else if (p->flags & IORING_SETUP_SQ_AFF) {
 		/* Can't have SQ_AFF without SQPOLL */
 		ret = -EINVAL;
@@ -7873,15 +7875,6 @@ err:
 	return ret;
 }

-static void io_sq_offload_start(struct io_ring_ctx *ctx)
-{
-	struct io_sq_data *sqd = ctx->sq_data;
-
-	ctx->flags &= ~IORING_SETUP_R_DISABLED;
-	if (ctx->flags & IORING_SETUP_SQPOLL)
-		complete(&sqd->startup);
-}
-
 static inline void __io_unaccount_mem(struct user_struct *user,
 				      unsigned long nr_pages)
 {
@@ -8742,11 +8735,6 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 	struct task_struct *task = current;

 	if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
-		/* never started, nothing to cancel */
-		if (ctx->flags & IORING_SETUP_R_DISABLED) {
-			io_sq_offload_start(ctx);
-			return;
-		}
 		io_sq_thread_park(ctx->sq_data);
 		task = ctx->sq_data->thread;
 		if (task)
@@ -9449,9 +9437,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	if (ret)
 		goto err;

-	if (!(p->flags & IORING_SETUP_R_DISABLED))
-		io_sq_offload_start(ctx);
-
 	memset(&p->sq_off, 0, sizeof(p->sq_off));
 	p->sq_off.head = offsetof(struct io_rings, sq.head);
 	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
@@ -9668,7 +9653,9 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
 	if (ctx->restrictions.registered)
 		ctx->restricted = 1;

-	io_sq_offload_start(ctx);
+	ctx->flags &= ~IORING_SETUP_R_DISABLED;
+	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
+		wake_up(&ctx->sq_data->wait);
 	return 0;
 }
--
cgit v1.2.3


From 61cf93700fe6359552848ed5e3becba6cd760efa Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Mon, 8 Mar 2021 14:16:16 +0000
Subject: io_uring: Convert personality_idr to XArray

You can't call idr_remove() from within an idr_for_each() callback,
but you can call xa_erase() from an xa_for_each() loop, so switch the
entire personality_idr from the IDR to the XArray. The bug manifests
as a use-after-free: idr_for_each() attempts to walk the rest of the
node after removing the last entry from it.
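The difference can be sketched as follows (illustrative only, using the
kernel's XArray API with locking and error handling elided):

	static DEFINE_XARRAY_ALLOC1(ids);	/* allocating XArray, IDs start at 1 */

	static void erase_all(void)
	{
		unsigned long index;
		void *entry;

		/*
		 * Erasing the current entry while walking is fine with
		 * xa_for_each()/xa_erase(), unlike calling idr_remove()
		 * from inside an idr_for_each() callback.
		 */
		xa_for_each(&ids, index, entry)
			xa_erase(&ids, index);
	}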
Fixes: 071698e13ac6 ("io_uring: allow registering credentials") Cc: stable@vger.kernel.org # 5.6+ Reported-by: yangerkun Signed-off-by: Matthew Wilcox (Oracle) [Pavel: rebased (creds load was moved into io_init_req())] Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7ccff36e1375f2b0ebf73d957f037b43becc0dde.1615212806.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3f6db813d670..84eb499368a4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -406,7 +406,8 @@ struct io_ring_ctx { struct idr io_buffer_idr; - struct idr personality_idr; + struct xarray personalities; + u32 pers_next; struct { unsigned cached_cq_tail; @@ -1137,7 +1138,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_completion(&ctx->ref_comp); init_completion(&ctx->sq_thread_comp); idr_init(&ctx->io_buffer_idr); - idr_init(&ctx->personality_idr); + xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); @@ -6337,7 +6338,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->work.list.next = NULL; personality = READ_ONCE(sqe->personality); if (personality) { - req->work.creds = idr_find(&ctx->personality_idr, personality); + req->work.creds = xa_load(&ctx->personalities, personality); if (!req->work.creds) return -EINVAL; get_cred(req->work.creds); @@ -8355,7 +8356,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); - idr_destroy(&ctx->personality_idr); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { @@ -8420,7 +8420,7 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { const struct cred *creds; - creds = idr_remove(&ctx->personality_idr, id); + creds = xa_erase(&ctx->personalities, id); if (creds) { put_cred(creds); return 0; @@ -8429,14 +8429,6 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) return -EINVAL; } -static int io_remove_personalities(int id, void *p, void *data) -{ - struct io_ring_ctx *ctx = data; - - io_unregister_personality(ctx, id); - return 0; -} - static bool io_run_ctx_fallback(struct io_ring_ctx *ctx) { struct callback_head *work, *next; @@ -8526,13 +8518,17 @@ static void io_ring_exit_work(struct work_struct *work) static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { + unsigned long index; + struct creds *creds; + mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); /* if force is set, the ring is going away. 
always drop after that */ ctx->cq_overflow_flushed = 1; if (ctx->rings) __io_cqring_overflow_flush(ctx, true, NULL, NULL); - idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); + xa_for_each(&ctx->personalities, index, creds) + io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); io_kill_timeouts(ctx, NULL, NULL); @@ -9162,10 +9158,9 @@ out_fput: } #ifdef CONFIG_PROC_FS -static int io_uring_show_cred(int id, void *p, void *data) +static int io_uring_show_cred(struct seq_file *m, unsigned int id, + const struct cred *cred) { - const struct cred *cred = p; - struct seq_file *m = data; struct user_namespace *uns = seq_user_ns(m); struct group_info *gi; kernel_cap_t cap; @@ -9233,9 +9228,13 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, (unsigned int) buf->len); } - if (has_lock && !idr_is_empty(&ctx->personality_idr)) { + if (has_lock && !xa_empty(&ctx->personalities)) { + unsigned long index; + const struct cred *cred; + seq_printf(m, "Personalities:\n"); - idr_for_each(&ctx->personality_idr, io_uring_show_cred, m); + xa_for_each(&ctx->personalities, index, cred) + io_uring_show_cred(m, index, cred); } seq_printf(m, "PollList:\n"); spin_lock_irq(&ctx->completion_lock); @@ -9564,14 +9563,16 @@ out: static int io_register_personality(struct io_ring_ctx *ctx) { const struct cred *creds; + u32 id; int ret; creds = get_current_cred(); - ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, - USHRT_MAX, GFP_KERNEL); - if (ret < 0) - put_cred(creds); + ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, + XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); + if (!ret) + return id; + put_cred(creds); return ret; } -- cgit v1.2.3 From cc20e3fec682700b673fcd286e6bef8e9da947e2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Mar 2021 09:34:43 -0700 Subject: io-wq: remove unused 'user' member of io_wq Previous patches killed the last user of this, now it's just a dead member in the struct. Get rid of it. Signed-off-by: Jens Axboe --- fs/io-wq.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 1ab9324e602f..c2e7031f6d09 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -110,7 +110,6 @@ struct io_wq { io_wq_work_fn *do_work; struct task_struct *manager; - struct user_struct *user; struct io_wq_hash *hash; -- cgit v1.2.3 From 97a73a0f9fbfb2be682fd037814576dbfa0e0da8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Mar 2021 17:30:54 +0000 Subject: io_uring: fix io_sq_offload_create error handling Don't set IO_SQ_THREAD_SHOULD_STOP when io_sq_offload_create() has failed on io_uring_alloc_task_context() but leave everything to io_sq_thread_finish(), because currently io_sq_thread_finish() hangs on trying to park it. That's great it stalls there, because otherwise the following io_sq_thread_stop() would be skipped on IO_SQ_THREAD_SHOULD_STOP check and the sqo would race for sqd with freeing ctx. A simple error injection gives something like this. [ 245.463955] INFO: task sqpoll-test-hang:523 blocked for more than 122 seconds. 
[ 245.463983] Call Trace: [ 245.463990] __schedule+0x36b/0x950 [ 245.464005] schedule+0x68/0xe0 [ 245.464013] schedule_timeout+0x209/0x2a0 [ 245.464032] wait_for_completion+0x8b/0xf0 [ 245.464043] io_sq_thread_finish+0x44/0x1a0 [ 245.464049] io_uring_setup+0x9ea/0xc80 [ 245.464058] __x64_sys_io_uring_setup+0x16/0x20 [ 245.464064] do_syscall_64+0x38/0x50 [ 245.464073] entry_SYSCALL_64_after_hwframe+0x44/0xae Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 84eb499368a4..3299807894ec 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7856,10 +7856,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, ret = PTR_ERR(tsk); goto err; } - ret = io_uring_alloc_task_context(tsk, ctx); - if (ret) - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + sqd->thread = tsk; + ret = io_uring_alloc_task_context(tsk, ctx); wake_up_new_task(tsk); if (ret) goto err; -- cgit v1.2.3 From 33cc89a9fc248a486857381584cc6b67d9405fab Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 9 Mar 2021 00:37:58 +0000 Subject: io_uring: add io_disarm_next() helper A preparation patch placing all preparations before extracting a next request into a separate helper io_disarm_next(). Also, don't spuriously do ev_posted in a rare case where REQ_F_FAIL_LINK is set but there are no requests linked (i.e. after cancelling a linked timeout or setting IOSQE_IO_LINK on a last request of a submission batch). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/44ecff68d6b47e1c4e6b891bdde1ddc08cfc3590.1615250156.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 68 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 3299807894ec..cc9a2cc95608 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1705,15 +1705,11 @@ static inline void io_remove_next_linked(struct io_kiocb *req) nxt->link = NULL; } -static void io_kill_linked_timeout(struct io_kiocb *req) +static bool io_kill_linked_timeout(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) { - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *link; + struct io_kiocb *link = req->link; bool cancelled = false; - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); - link = req->link; /* * Can happen if a linked timeout fired and link had been like @@ -1728,50 +1724,48 @@ static void io_kill_linked_timeout(struct io_kiocb *req) ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { io_cqring_fill_event(link, -ECANCELED); - io_commit_cqring(ctx); + io_put_req_deferred(link, 1); cancelled = true; } } req->flags &= ~REQ_F_LINK_TIMEOUT; - spin_unlock_irqrestore(&ctx->completion_lock, flags); - - if (cancelled) { - io_cqring_ev_posted(ctx); - io_put_req(link); - } + return cancelled; } - static void io_fail_links(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) { - struct io_kiocb *link, *nxt; - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; + struct io_kiocb *nxt, *link = req->link; - spin_lock_irqsave(&ctx->completion_lock, flags); - link = req->link; req->link = NULL; - while (link) { nxt = link->link; link->link = NULL; trace_io_uring_fail_link(req, link); io_cqring_fill_event(link, -ECANCELED); - io_put_req_deferred(link, 2); link = nxt; } - io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, 
flags); +} - io_cqring_ev_posted(ctx); +static bool io_disarm_next(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) +{ + bool posted = false; + + if (likely(req->flags & REQ_F_LINK_TIMEOUT)) + posted = io_kill_linked_timeout(req); + if (unlikely(req->flags & REQ_F_FAIL_LINK)) { + posted |= (req->link != NULL); + io_fail_links(req); + } + return posted; } static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) { - if (req->flags & REQ_F_LINK_TIMEOUT) - io_kill_linked_timeout(req); + struct io_kiocb *nxt; /* * If LINK is set, we have dependent requests in this chain. If we @@ -1779,14 +1773,22 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (likely(!(req->flags & REQ_F_FAIL_LINK))) { - struct io_kiocb *nxt = req->link; + if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) { + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + bool posted; - req->link = NULL; - return nxt; + spin_lock_irqsave(&ctx->completion_lock, flags); + posted = io_disarm_next(req); + if (posted) + io_commit_cqring(req->ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (posted) + io_cqring_ev_posted(ctx); } - io_fail_links(req); - return NULL; + nxt = req->link; + req->link = NULL; + return nxt; } static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) -- cgit v1.2.3 From 7a612350a989866510dc5c874fd8ffe1f37555d2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 9 Mar 2021 00:37:59 +0000 Subject: io_uring: fix complete_post races for linked req Calling io_queue_next() after spin_unlock in io_req_complete_post() races with the other side extracting and reusing this request. Hand coded parts of io_req_find_next() considering that io_disarm_next() and io_req_task_queue() have (and safe) to be called with completion_lock held. It already does io_commit_cqring() and io_cqring_ev_posted(), so just reuse it for post io_disarm_next(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5672a62f3150ee7c55849f40c0037655c4f2840f.1615250156.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index cc9a2cc95608..f7153483a3ac 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -985,6 +985,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_UNLINKAT] = {}, }; +static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_task_file(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, @@ -1525,15 +1526,14 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) __io_cqring_fill_event(req, res, 0); } -static inline void io_req_complete_post(struct io_kiocb *req, long res, - unsigned int cflags) +static void io_req_complete_post(struct io_kiocb *req, long res, + unsigned int cflags) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); __io_cqring_fill_event(req, res, cflags); - io_commit_cqring(ctx); /* * If we're the last reference to this request, add to our locked * free_list cache. 
@@ -1541,19 +1541,26 @@ static inline void io_req_complete_post(struct io_kiocb *req, long res, if (refcount_dec_and_test(&req->refs)) { struct io_comp_state *cs = &ctx->submit_state.comp; + if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { + if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) + io_disarm_next(req); + if (req->link) { + io_req_task_queue(req->link); + req->link = NULL; + } + } io_dismantle_req(req); io_put_task(req->task, 1); list_add(&req->compl.list, &cs->locked_free_list); cs->locked_free_nr++; } else req = NULL; + io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); - io_cqring_ev_posted(ctx); - if (req) { - io_queue_next(req); + + if (req) percpu_ref_put(&ctx->refs); - } } static void io_req_complete_state(struct io_kiocb *req, long res, -- cgit v1.2.3 From 70e35125093b05b0e607ba1f5358ddf76946756c Mon Sep 17 00:00:00 2001 From: yangerkun Date: Tue, 9 Mar 2021 11:04:10 +0800 Subject: io-wq: fix ref leak for req in case of exit cancelations do_work such as io_wq_submit_work that cancel the work may leave a ref of req as 1 if we have links. Fix it by call io_run_cancel. Fixes: 4fb6ac326204 ("io-wq: improve manager/worker handling over exec") Signed-off-by: yangerkun Link: https://lore.kernel.org/r/20210309030410.3294078-1-yangerkun@huawei.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index c2e7031f6d09..3d7060ba547a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -799,8 +799,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) /* Can only happen if manager creation fails after exec */ if (io_wq_fork_manager(wqe->wq) || test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) { - work->flags |= IO_WQ_WORK_CANCEL; - wqe->wq->do_work(work); + io_run_cancel(work, wqe); return; } -- cgit v1.2.3 From 93e68e036c2fc1ce18e784418e4e19975a5882b4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Mar 2021 07:02:21 -0700 Subject: io_uring: move all io_kiocb init early in io_init_req() If we hit an error path in the function, make sure that the io_kiocb is fully initialized at that point so that freeing the request always sees a valid state. Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index f7153483a3ac..0f18e4a7bd08 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6327,6 +6327,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, refcount_set(&req->refs, 2); req->task = current; req->result = 0; + req->work.list.next = NULL; + req->work.creds = NULL; + req->work.flags = 0; /* enforce forwards compatibility on users */ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { @@ -6344,17 +6347,13 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, !io_op_defs[req->opcode].buffer_select) return -EOPNOTSUPP; - req->work.list.next = NULL; personality = READ_ONCE(sqe->personality); if (personality) { req->work.creds = xa_load(&ctx->personalities, personality); if (!req->work.creds) return -EINVAL; get_cred(req->work.creds); - } else { - req->work.creds = NULL; } - req->work.flags = 0; state = &ctx->submit_state; /* -- cgit v1.2.3 From 5199328a0d415b3e372633096b1b92f36b8ac9e5 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 9 Mar 2021 14:30:41 +0800 Subject: io_uring: remove unneeded variable 'ret' Fix the following coccicheck warning: ./fs/io_uring.c:8984:5-8: Unneeded variable: "ret". 
Return "0" on line 8998 Reported-by: Abaci Robot Signed-off-by: Yang Li Link: https://lore.kernel.org/r/1615271441-33649-1-git-send-email-yang.lee@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0f18e4a7bd08..6325f32ef6a3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9022,7 +9022,6 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) { - int ret = 0; DEFINE_WAIT(wait); do { @@ -9036,7 +9035,7 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) } while (!signal_pending(current)); finish_wait(&ctx->sqo_sq_wait, &wait); - return ret; + return 0; } static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, -- cgit v1.2.3 From e8f98f24549d62cc54bf608c815904a56d4437bc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Mar 2021 16:32:13 -0700 Subject: io_uring: always wait for sqd exited when stopping SQPOLL thread We have a tiny race where io_put_sq_data() calls io_sq_thead_stop() and finds the thread gone, but the thread has indeed not fully exited or called complete() yet. Close it up by always having io_sq_thread_stop() wait on completion of the exit event. Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6325f32ef6a3..62f998bf2ce8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7079,12 +7079,9 @@ static void io_sq_thread_stop(struct io_sq_data *sqd) if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) return; down_write(&sqd->rw_lock); - if (!sqd->thread) { - up_write(&sqd->rw_lock); - return; - } set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - wake_up_process(sqd->thread); + if (sqd->thread) + wake_up_process(sqd->thread); up_write(&sqd->rw_lock); wait_for_completion(&sqd->exited); } @@ -7849,9 +7846,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, ret = -EINVAL; if (cpu >= nr_cpu_ids) - goto err; + goto err_sqpoll; if (!cpu_online(cpu)) - goto err; + goto err_sqpoll; sqd->sq_cpu = cpu; } else { @@ -7862,7 +7859,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); if (IS_ERR(tsk)) { ret = PTR_ERR(tsk); - goto err; + goto err_sqpoll; } sqd->thread = tsk; @@ -7881,6 +7878,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, err: io_sq_thread_finish(ctx); return ret; +err_sqpoll: + complete(&ctx->sq_data->exited); + goto err; } static inline void __io_unaccount_mem(struct user_struct *user, -- cgit v1.2.3 From e22bc9b481a90d7898984ea17621f04a653e2cd1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Mar 2021 19:49:02 -0700 Subject: kernel: make IO threads unfreezable by default The io-wq threads were already marked as no-freeze, but the manager was not. On resume, we perpetually have signal_pending() being true, and hence the manager will loop and spin 100% of the time. Just mark the tasks created by create_io_thread() as PF_NOFREEZE by default, and remove any knowledge of it in io-wq and io_uring. 
Reported-by: Kevin Locke
Tested-by: Kevin Locke
Signed-off-by: Jens Axboe
---
 fs/io-wq.c    | 3 +--
 fs/io_uring.c | 1 -
 kernel/fork.c | 1 +
 3 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 3d7060ba547a..0ae9ecadf295 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -591,7 +591,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 	tsk->pf_io_worker = worker;
 	worker->task = tsk;
 	set_cpus_allowed_ptr(tsk, cpumask_of_node(wqe->node));
-	tsk->flags |= PF_NOFREEZE | PF_NO_SETAFFINITY;
+	tsk->flags |= PF_NO_SETAFFINITY;

 	raw_spin_lock_irq(&wqe->lock);
 	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
@@ -709,7 +709,6 @@ static int io_wq_manager(void *data)
 		set_current_state(TASK_INTERRUPTIBLE);
 		io_wq_check_workers(wq);
 		schedule_timeout(HZ);
-		try_to_freeze();
 		if (fatal_signal_pending(current))
 			set_bit(IO_WQ_BIT_EXIT, &wq->state);
 	} while (!test_bit(IO_WQ_BIT_EXIT, &wq->state));
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 62f998bf2ce8..14165e18020c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6733,7 +6733,6 @@ static int io_sq_thread(void *data)

 			up_read(&sqd->rw_lock);
 			schedule();
-			try_to_freeze();
 			down_read(&sqd->rw_lock);
 			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 				io_ring_clear_wakeup_flag(ctx);
diff --git a/kernel/fork.c b/kernel/fork.c
index d3171e8e88e5..72e444cd0ffe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2436,6 +2436,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
 	if (!IS_ERR(tsk)) {
 		sigfillset(&tsk->blocked);
 		sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
+		tsk->flags |= PF_NOFREEZE;
 	}
 	return tsk;
 }
--
cgit v1.2.3


From 78d7f6ba82edb7f8763390982be29051c4216772 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 10 Mar 2021 13:13:53 +0000
Subject: io_uring: fix invalid ctx->sq_thread_idle

We have to set ctx->sq_thread_idle before adding a ring to an SQ task,
otherwise sqd can race with the addition, see the idle time as zero,
and account it as such.

Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 14165e18020c..7072c0eb22c1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7829,14 +7829,14 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 		ctx->sq_creds = get_current_cred();
 		ctx->sq_data = sqd;

-		io_sq_thread_park(sqd);
-		list_add(&ctx->sqd_list, &sqd->ctx_new_list);
-		io_sq_thread_unpark(sqd);
-
 		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
 		if (!ctx->sq_thread_idle)
 			ctx->sq_thread_idle = HZ;

+		io_sq_thread_park(sqd);
+		list_add(&ctx->sqd_list, &sqd->ctx_new_list);
+		io_sq_thread_unpark(sqd);
+
 		if (sqd->thread)
 			return 0;
--
cgit v1.2.3


From 7d41e8543d809c3c900d1212d6ea887eb284b69a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 10 Mar 2021 13:13:54 +0000
Subject: io_uring: remove indirect ctx into sqo injection

We use ->ctx_new_list to notify sqo about a new ctx pending, then sqo
should stop and splice it to its sqd->ctx_list, paired with
->sq_thread_comp. The last one is broken because nobody reinitialises
it, and trying to fix it would only add more complexity and bugs. And
the first isn't really needed, as the addition is done under park(),
which protects against races well.

Add ctx into sqd->ctx_list directly (under park()); it's much simpler
and allows us to kill both ctx_new_list and sq_thread_comp.
note: apparently there is no real problem at the moment, because
sq_thread_comp is used only by io_sq_thread_finish() followed by
parking, where list_del(&ctx->sqd_list) removes it regardless of
whether it's in the new or the active list.

Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 28 +++-------------------------
 1 file changed, 3 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7072c0eb22c1..5c045a9f7ffe 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -262,7 +262,6 @@ struct io_sq_data {

 	/* ctx's that are using this sqd */
 	struct list_head	ctx_list;
-	struct list_head	ctx_new_list;

 	struct task_struct	*thread;
 	struct wait_queue_head	wait;
@@ -398,7 +397,6 @@ struct io_ring_ctx {
 	struct user_struct	*user;

 	struct completion	ref_comp;
-	struct completion	sq_thread_comp;

 #if defined(CONFIG_UNIX)
 	struct socket		*ring_sock;
@@ -1137,7 +1135,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_waitqueue_head(&ctx->cq_wait);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	init_completion(&ctx->ref_comp);
-	init_completion(&ctx->sq_thread_comp);
 	idr_init(&ctx->io_buffer_idr);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
@@ -6640,19 +6637,6 @@ static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
 	sqd->sq_thread_idle = sq_thread_idle;
 }

-static void io_sqd_init_new(struct io_sq_data *sqd)
-{
-	struct io_ring_ctx *ctx;
-
-	while (!list_empty(&sqd->ctx_new_list)) {
-		ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
-		list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
-		complete(&ctx->sq_thread_comp);
-	}
-
-	io_sqd_update_thread_idle(sqd);
-}
-
 static int io_sq_thread(void *data)
 {
 	struct io_sq_data *sqd = data;
@@ -6683,11 +6667,8 @@ static int io_sq_thread(void *data)
 			up_read(&sqd->rw_lock);
 			cond_resched();
 			down_read(&sqd->rw_lock);
-			continue;
-		}
-		if (unlikely(!list_empty(&sqd->ctx_new_list))) {
-			io_sqd_init_new(sqd);
 			timeout = jiffies + sqd->sq_thread_idle;
+			continue;
 		}
 		if (fatal_signal_pending(current))
 			break;
@@ -7099,9 +7080,6 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx)
 	if (sqd) {
 		complete(&sqd->startup);
-		if (sqd->thread)
-			wait_for_completion(&ctx->sq_thread_comp);
-
 		io_sq_thread_park(sqd);
 		list_del(&ctx->sqd_list);
 		io_sqd_update_thread_idle(sqd);
@@ -7153,7 +7131,6 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)

 	refcount_set(&sqd->refs, 1);
 	INIT_LIST_HEAD(&sqd->ctx_list);
-	INIT_LIST_HEAD(&sqd->ctx_new_list);
 	init_rwsem(&sqd->rw_lock);
 	init_waitqueue_head(&sqd->wait);
 	init_completion(&sqd->startup);
@@ -7834,7 +7811,8 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 			ctx->sq_thread_idle = HZ;

 		io_sq_thread_park(sqd);
-		list_add(&ctx->sqd_list, &sqd->ctx_new_list);
+		list_add(&ctx->sqd_list, &sqd->ctx_list);
+		io_sqd_update_thread_idle(sqd);
 		io_sq_thread_unpark(sqd);

 		if (sqd->thread)
--
cgit v1.2.3


From 14fbbc8297728e880070f7b077b3301a8c698ef9 Mon Sep 17 00:00:00 2001
From: Daiyue Zhang
Date: Mon, 1 Mar 2021 14:10:53 +0800
Subject: configfs: fix a use-after-free in __configfs_open_file

Commit b0841eefd969 ("configfs: provide exclusion between IO and
removals") uses ->frag_dead to mark the fragment state, thus not
bothering with an extra refcount on the config_item when opening a
file. The configfs_get_config_item() call was removed from
__configfs_open_file(), but the matching config_item_put() was not.
As a result, the refcount on the config_item loses its balance, causing
use-after-free issues on some occasions like this:

Test:
1. Mount configfs on /config with read-only items:
drwxrwx--- 289 root   root            0 2021-04-01 11:55 /config
drwxr-xr-x   2 root   root            0 2021-04-01 11:54 /config/a
--w--w--w-   1 root   root         4096 2021-04-01 11:53 /config/a/1.txt
......

2. Then run:
for file in /config
do
echo $file
grep -R 'key' $file
done

3. __configfs_open_file will be called in parallel; the first one to be
called will do:
if (file->f_mode & FMODE_READ) {
	if (!(inode->i_mode & S_IRUGO))
		goto out_put_module;
config_item_put(buffer->item);
	kref_put()
	package_details_release()
	kfree()

the other one will run into use-after-free issues like this:
BUG: KASAN: use-after-free in __configfs_open_file+0x1bc/0x3b0
Read of size 8 at addr fffffff155f02480 by task grep/13096
CPU: 0 PID: 13096 Comm: grep VIP: 00 Tainted: G W 4.14.116-kasan #1
TGID: 13096 Comm: grep
Call trace:
dump_stack+0x118/0x160
kasan_report+0x22c/0x294
__asan_load8+0x80/0x88
__configfs_open_file+0x1bc/0x3b0
configfs_open_file+0x28/0x34
do_dentry_open+0x2cc/0x5c0
vfs_open+0x80/0xe0
path_openat+0xd8c/0x2988
do_filp_open+0x1c4/0x2fc
do_sys_open+0x23c/0x404
SyS_openat+0x38/0x48

Allocated by task 2138:
kasan_kmalloc+0xe0/0x1ac
kmem_cache_alloc_trace+0x334/0x394
packages_make_item+0x4c/0x180
configfs_mkdir+0x358/0x740
vfs_mkdir2+0x1bc/0x2e8
SyS_mkdirat+0x154/0x23c
el0_svc_naked+0x34/0x38

Freed by task 13096:
kasan_slab_free+0xb8/0x194
kfree+0x13c/0x910
package_details_release+0x524/0x56c
kref_put+0xc4/0x104
config_item_put+0x24/0x34
__configfs_open_file+0x35c/0x3b0
configfs_open_file+0x28/0x34
do_dentry_open+0x2cc/0x5c0
vfs_open+0x80/0xe0
path_openat+0xd8c/0x2988
do_filp_open+0x1c4/0x2fc
do_sys_open+0x23c/0x404
SyS_openat+0x38/0x48
el0_svc_naked+0x34/0x38

To fix this issue, remove the config_item_put() in
__configfs_open_file() to balance the refcount of the config_item.

Fixes: b0841eefd969 ("configfs: provide exclusion between IO and removals")
Signed-off-by: Daiyue Zhang
Signed-off-by: Yi Chen
Signed-off-by: Ge Qiu
Reviewed-by: Chao Yu
Acked-by: Al Viro
Signed-off-by: Christoph Hellwig
---
 fs/configfs/file.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 1f0270229d7b..da8351d1e455 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -378,7 +378,7 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type

 	attr = to_attr(dentry);
 	if (!attr)
-		goto out_put_item;
+		goto out_free_buffer;

 	if (type & CONFIGFS_ITEM_BIN_ATTR) {
 		buffer->bin_attr = to_bin_attr(dentry);
@@ -391,7 +391,7 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type
 	/* Grab the module reference for this attribute if we have one */
 	error = -ENODEV;
 	if (!try_module_get(buffer->owner))
-		goto out_put_item;
+		goto out_free_buffer;

 	error = -EACCES;
 	if (!buffer->item->ci_type)
@@ -435,8 +435,6 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type

 out_put_module:
 	module_put(buffer->owner);
-out_put_item:
-	config_item_put(buffer->item);
 out_free_buffer:
 	up_read(&frag->frag_sem);
 	kfree(buffer);
--
cgit v1.2.3


From a8affc03a9b375e19bc81573de0c9108317d78c7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 11 Mar 2021 12:01:37 +0100
Subject: block: rename BIO_MAX_PAGES to BIO_MAX_VECS

Ever since the addition of multipage bio_vecs, BIO_MAX_PAGES has been
horribly and confusingly misnamed.
Rename it to BIO_MAX_VECS to stop confusing users of the bio API. Signed-off-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210311110137.1132391-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 14 +++++++------- block/blk-crypto-fallback.c | 2 +- block/blk-lib.c | 2 +- block/blk-map.c | 2 +- block/bounce.c | 6 +++--- drivers/block/drbd/drbd_int.h | 2 +- drivers/md/bcache/super.c | 2 +- drivers/md/dm-crypt.c | 8 ++++---- drivers/md/dm-writecache.c | 4 ++-- drivers/md/raid5-cache.c | 4 ++-- drivers/md/raid5-ppl.c | 2 +- drivers/nvme/target/passthru.c | 6 +++--- fs/block_dev.c | 6 +++--- fs/btrfs/extent_io.c | 2 +- fs/btrfs/scrub.c | 2 +- fs/crypto/bio.c | 6 +++--- fs/erofs/zdata.c | 2 +- fs/ext4/page-io.c | 2 +- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 4 ++-- fs/f2fs/segment.c | 2 +- fs/f2fs/segment.h | 4 ++-- fs/f2fs/super.c | 4 ++-- fs/gfs2/lops.c | 2 +- fs/iomap/buffered-io.c | 4 ++-- fs/iomap/direct-io.c | 4 ++-- fs/mpage.c | 2 +- fs/nilfs2/segbuf.c | 2 +- fs/squashfs/block.c | 2 +- fs/zonefs/super.c | 2 +- include/linux/bio.h | 4 ++-- 31 files changed, 56 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/block/bio.c b/block/bio.c index a1c4d2900c7a..26b7f721cda8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -33,7 +33,7 @@ static struct biovec_slab { { .nr_vecs = 16, .name = "biovec-16" }, { .nr_vecs = 64, .name = "biovec-64" }, { .nr_vecs = 128, .name = "biovec-128" }, - { .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" }, + { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" }, }; static struct biovec_slab *biovec_slab(unsigned short nr_vecs) @@ -46,7 +46,7 @@ static struct biovec_slab *biovec_slab(unsigned short nr_vecs) return &bvec_slabs[1]; case 65 ... 128: return &bvec_slabs[2]; - case 129 ... BIO_MAX_PAGES: + case 129 ... BIO_MAX_VECS: return &bvec_slabs[3]; default: BUG(); @@ -151,9 +151,9 @@ out: void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { - BIO_BUG_ON(nr_vecs > BIO_MAX_PAGES); + BIO_BUG_ON(nr_vecs > BIO_MAX_VECS); - if (nr_vecs == BIO_MAX_PAGES) + if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); else if (nr_vecs > BIO_INLINE_VECS) kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); @@ -186,15 +186,15 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, /* * Try a slab allocation first for all smaller allocations. If that * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. - * The mempool is sized to handle up to BIO_MAX_PAGES entries. + * The mempool is sized to handle up to BIO_MAX_VECS entries. 
*/ - if (*nr_vecs < BIO_MAX_PAGES) { + if (*nr_vecs < BIO_MAX_VECS) { struct bio_vec *bvl; bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) return bvl; - *nr_vecs = BIO_MAX_PAGES; + *nr_vecs = BIO_MAX_VECS; } return mempool_alloc(pool, gfp_mask); diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index c176b7af56a7..c322176a1e09 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -219,7 +219,7 @@ static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) bio_for_each_segment(bv, bio, iter) { num_sectors += bv.bv_len >> SECTOR_SHIFT; - if (++i == BIO_MAX_PAGES) + if (++i == BIO_MAX_VECS) break; } if (num_sectors < bio_sectors(bio)) { diff --git a/block/blk-lib.c b/block/blk-lib.c index 752f9c722062..7b256131b20b 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -296,7 +296,7 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) { sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512); - return min(pages, (sector_t)BIO_MAX_PAGES); + return min(pages, (sector_t)BIO_MAX_VECS); } static int __blkdev_issue_zero_pages(struct block_device *bdev, diff --git a/block/blk-map.c b/block/blk-map.c index 369e204d14d0..1ffef782fcf2 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -249,7 +249,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (!iov_iter_count(iter)) return -EINVAL; - bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); + bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_VECS)); if (!bio) return -ENOMEM; bio->bi_opf |= req_op(rq); diff --git a/block/bounce.c b/block/bounce.c index 87983a35079c..6c441f4f1cd4 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -229,10 +229,10 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) * - The point of cloning the biovec is to produce a bio with a biovec * the caller can modify: bi_idx and bi_bvec_done should be 0. * - * - The original bio could've had more than BIO_MAX_PAGES biovecs; if + * - The original bio could've had more than BIO_MAX_VECS biovecs; if * we tried to clone the whole thing bio_alloc_bioset() would fail. * But the clone should succeed as long as the number of biovecs we - * actually need to allocate is fewer than BIO_MAX_PAGES. + * actually need to allocate is fewer than BIO_MAX_VECS. * * - Lastly, bi_vcnt should not be looked at or relied upon by code * that does not own the bio - reason being drivers don't use it for @@ -299,7 +299,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, int sectors = 0; bio_for_each_segment(from, *bio_orig, iter) { - if (i++ < BIO_MAX_PAGES) + if (i++ < BIO_MAX_VECS) sectors += from.bv_len >> 9; if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn) bounce = true; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 7d9cc433b758..5d9181382ce1 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1324,7 +1324,7 @@ struct bm_extent { * A followup commit may allow even bigger BIO sizes, * once we thought that through. 
*/ #define DRBD_MAX_BIO_SIZE (1U << 20) -#if DRBD_MAX_BIO_SIZE > (BIO_MAX_PAGES << PAGE_SHIFT) +#if DRBD_MAX_BIO_SIZE > (BIO_MAX_VECS << PAGE_SHIFT) #error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE #endif #define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 71691f32959b..03e1fe4de53d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -965,7 +965,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, q->limits.max_hw_sectors = UINT_MAX; q->limits.max_sectors = UINT_MAX; q->limits.max_segment_size = UINT_MAX; - q->limits.max_segments = BIO_MAX_PAGES; + q->limits.max_segments = BIO_MAX_VECS; blk_queue_max_discard_sectors(q, UINT_MAX); q->limits.discard_granularity = 512; q->limits.io_min = block_size; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 11c105ecd165..b0ab080f2567 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -229,7 +229,7 @@ static DEFINE_SPINLOCK(dm_crypt_clients_lock); static unsigned dm_crypt_clients_n = 0; static volatile unsigned long dm_crypt_pages_per_client; #define DM_CRYPT_MEMORY_PERCENT 2 -#define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_PAGES * 16) +#define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_VECS * 16) static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); @@ -3246,7 +3246,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size, ARCH_KMALLOC_MINALIGN); - ret = mempool_init(&cc->page_pool, BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc); + ret = mempool_init(&cc->page_pool, BIO_MAX_VECS, crypt_page_alloc, crypt_page_free, cc); if (ret) { ti->error = "Cannot allocate page mempool"; goto bad; @@ -3373,9 +3373,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) /* * Check if bio is too large, split as needed. 
*/ - if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) && + if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_VECS << PAGE_SHIFT)) && (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size)) - dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT)); + dm_accept_partial_bio(bio, ((BIO_MAX_VECS << PAGE_SHIFT) >> SECTOR_SHIFT)); /* * Ensure that bio is a multiple of internal sector encryption size diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 844c4be11768..4f72b6f66c3a 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1892,10 +1892,10 @@ restart: list_add(&g->lru, &wbl.list); wbl.size++; g->write_in_progress = true; - g->wc_list_contiguous = BIO_MAX_PAGES; + g->wc_list_contiguous = BIO_MAX_VECS; f = g; e->wc_list_contiguous++; - if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) { + if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) { if (unlikely(wc->writeback_all)) { next_node = rb_next(&f->rb_node); if (likely(next_node)) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 4337ae0e6af2..0b5dcaabbc15 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -735,7 +735,7 @@ static void r5l_submit_current_io(struct r5l_log *log) static struct bio *r5l_bio_alloc(struct r5l_log *log) { - struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &log->bs); + struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS, &log->bs); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio_set_dev(bio, log->rdev->bdev); @@ -1634,7 +1634,7 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, { struct page *page; - ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, &log->bs); + ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_VECS, &log->bs); if (!ctx->ra_bio) return -ENOMEM; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index e8c118e05dfd..3ddc2aa0b530 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -496,7 +496,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) { struct bio *prev = bio; - bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, + bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS, &ppl_conf->bs); bio->bi_opf = prev->bi_opf; bio->bi_write_hint = prev->bi_write_hint; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 26c587ccd152..2798944899b7 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -50,9 +50,9 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) /* * nvmet_passthru_map_sg is limitted to using a single bio so limit - * the mdts based on BIO_MAX_PAGES as well + * the mdts based on BIO_MAX_VECS as well */ - max_hw_sectors = min_not_zero(BIO_MAX_PAGES << (PAGE_SHIFT - 9), + max_hw_sectors = min_not_zero(BIO_MAX_VECS << (PAGE_SHIFT - 9), max_hw_sectors); page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; @@ -191,7 +191,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) struct bio *bio; int i; - if (req->sg_cnt > BIO_MAX_PAGES) + if (req->sg_cnt > BIO_MAX_VECS) return -EINVAL; if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 03166b3dea4d..92ed7d5df677 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -432,7 +432,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES); 
+ nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); if (!nr_pages) { bool polled = false; @@ -500,8 +500,8 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (!iov_iter_count(iter)) return 0; - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES + 1); - if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); + if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dfb3ead1175..db8cb98c020c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3048,7 +3048,7 @@ struct bio *btrfs_bio_alloc(u64 first_byte) { struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 582df11d298a..6daa4309c974 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1428,7 +1428,7 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, if (!first_page->dev->bdev) goto out; - bio = btrfs_io_bio_alloc(BIO_MAX_PAGES); + bio = btrfs_io_bio_alloc(BIO_MAX_VECS); bio_set_dev(bio, first_page->dev->bdev); for (page_num = 0; page_num < sblock->page_count; page_num++) { diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index b048a0e38516..68a2de6b5a9b 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -52,7 +52,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, int num_pages = 0; /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */ - bio = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS); while (len) { unsigned int blocks_this_page = min(len, blocks_per_page); @@ -74,7 +74,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, len -= blocks_this_page; lblk += blocks_this_page; pblk += blocks_this_page; - if (num_pages == BIO_MAX_PAGES || !len || + if (num_pages == BIO_MAX_VECS || !len || !fscrypt_mergeable_bio(bio, inode, lblk)) { err = submit_bio_wait(bio); if (err) @@ -126,7 +126,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, return fscrypt_zeroout_range_inline_crypt(inode, lblk, pblk, len); - BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_PAGES); + BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS); nr_pages = min_t(unsigned int, ARRAY_SIZE(pages), (len + blocks_per_page - 1) >> blocks_per_page_bits); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 6cb356c4217b..3851e1a64f73 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1235,7 +1235,7 @@ submit_bio_retry: } if (!bio) { - bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); bio->bi_end_io = z_erofs_decompressqueue_endio; bio_set_dev(bio, sb->s_bdev); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 03a44a0de86a..f038d578d8d8 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -398,7 +398,7 @@ static void io_submit_init_bio(struct ext4_io_submit *io, * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). 
*/ - bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 174a0819ad96..be5415a0dbbc 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -292,7 +292,7 @@ void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); + f2fs_ra_meta_pages(sbi, index, BIO_MAX_VECS, META_POR, true); } static int __f2fs_write_meta_page(struct page *page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7c95818639a6..4e5257c763d0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -857,7 +857,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL); alloc_new: if (!bio) { - bio = __bio_alloc(fio, BIO_MAX_PAGES); + bio = __bio_alloc(fio, BIO_MAX_VECS); __attach_io_flag(fio); f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, fio->page->index, fio, GFP_NOIO); @@ -932,7 +932,7 @@ alloc_new: fio->retry = true; goto skip; } - io->bio = __bio_alloc(fio, BIO_MAX_PAGES); + io->bio = __bio_alloc(fio, BIO_MAX_VECS); f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host, bio_page->index, fio, GFP_NOIO); io->fio = *fio; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 993004f06a77..c2866561263e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4381,7 +4381,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) block_t total_node_blocks = 0; do { - readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, META_SIT, true); start = start_blk * sit_i->sents_per_block; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 229814b4f4a6..e9a7a637d688 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -851,7 +851,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * BIO_MAX_PAGES; + return 8 * BIO_MAX_VECS; else return 0; } @@ -868,7 +868,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - desired = BIO_MAX_PAGES; + desired = BIO_MAX_VECS; if (type == NODE) desired <<= 1; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7069793752f1..82592b19b4e0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -753,9 +753,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_io_size_bits: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) { + if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) { f2fs_warn(sbi, "Not support %d, larger than %d", - 1 << arg, BIO_MAX_PAGES); + 1 << arg, BIO_MAX_VECS); return -EINVAL; } F2FS_OPTION(sbi).write_io_size_bits = arg; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index dc1b93a877c6..a82f4747aa8d 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -267,7 +267,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, bio_end_io_t *end_io) { struct super_block *sb = sdp->sd_vfs; - struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift; bio_set_dev(bio, sb->s_bdev); diff --git a/fs/iomap/buffered-io.c 
b/fs/iomap/buffered-io.c index 7ffcd7ef33d4..414769a6ad11 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1221,7 +1221,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &iomap_ioend_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &iomap_ioend_bioset); bio_set_dev(bio, wpc->iomap.bdev); bio->bi_iter.bi_sector = sector; bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); @@ -1252,7 +1252,7 @@ iomap_chain_bio(struct bio *prev) { struct bio *new; - new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + new = bio_alloc(GFP_NOFS, BIO_MAX_VECS); bio_copy_dev(new, prev);/* also copies over blkcg information */ new->bi_iter.bi_sector = bio_end_sector(prev); new->bi_opf = prev->bi_opf; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index e2c4991833b8..bdd0d89bbf0a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -296,7 +296,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, */ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); - nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_PAGES); + nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { size_t n; if (dio->error) { @@ -338,7 +338,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, copied += n; nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, - BIO_MAX_PAGES); + BIO_MAX_VECS); iomap_dio_submit_bio(dio, iomap, bio, pos); pos += n; } while (nr_pages); diff --git a/fs/mpage.c b/fs/mpage.c index 961234d68779..334e7d09aa65 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -616,7 +616,7 @@ alloc_new: goto out; } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH); + BIO_MAX_VECS, GFP_NOFS|__GFP_HIGH); if (bio == NULL) goto confused; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 1e75417bfe6e..56872e93823d 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -399,7 +399,7 @@ static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, { wi->bio = NULL; wi->rest_blocks = segbuf->sb_sum.nblocks; - wi->max_pages = BIO_MAX_PAGES; + wi->max_pages = BIO_MAX_VECS; wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); wi->start = wi->end = 0; wi->blocknr = segbuf->sb_pseg_start; diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 45f44425d856..b9e87ebb1060 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -87,7 +87,7 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, int error, i; struct bio *bio; - if (page_count <= BIO_MAX_PAGES) + if (page_count <= BIO_MAX_VECS) bio = bio_alloc(GFP_NOIO, page_count); else bio = bio_kmalloc(GFP_NOIO, page_count); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index b6ff4a21abac..0fe76f376dee 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -684,7 +684,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); iov_iter_truncate(from, max); - nr_pages = iov_iter_npages(from, BIO_MAX_PAGES); + nr_pages = iov_iter_npages(from, BIO_MAX_VECS); if (!nr_pages) return 0; diff --git a/include/linux/bio.h b/include/linux/bio.h index 983ed2fe7c85..d0246c92a6e8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -20,11 +20,11 @@ #define BIO_BUG_ON #endif -#define BIO_MAX_PAGES 256U +#define BIO_MAX_VECS 256U static inline unsigned int bio_max_segs(unsigned int nr_segs) { - return 
min(nr_segs, BIO_MAX_PAGES); + return min(nr_segs, BIO_MAX_VECS); } #define bio_prio(bio) (bio)->bi_ioprio -- cgit v1.2.3 From f053cf7aa66cd9d592b0fc967f4d887c2abff1b7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 2 Mar 2021 12:04:19 -0800 Subject: ext4: fix error handling in ext4_end_enable_verity() ext4 didn't properly clean up if verity failed to be enabled on a file: - It left verity metadata (pages past EOF) in the page cache, which would be exposed to userspace if the file was later extended. - It didn't truncate the verity metadata at all (either from cache or from disk) if an error occurred while setting the verity bit. Fix these bugs by adding a call to truncate_inode_pages() and ensuring that we truncate the verity metadata (both from cache and from disk) in all error paths. Also rework the code to cleanly separate the success path from the error paths, which makes it much easier to understand. Reported-by: Yunlei He Fixes: c93d8f885809 ("ext4: add basic fs-verity support") Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20210302200420.137977-2-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/verity.c | 89 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 5b7ba8f71153..00e3cbde472e 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -201,55 +201,76 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc, struct inode *inode = file_inode(filp); const int credits = 2; /* superblock and inode for ext4_orphan_del() */ handle_t *handle; + struct ext4_iloc iloc; int err = 0; - int err2; - if (desc != NULL) { - /* Succeeded; write the verity descriptor. */ - err = ext4_write_verity_descriptor(inode, desc, desc_size, - merkle_tree_size); - - /* Write all pages before clearing VERITY_IN_PROGRESS. */ - if (!err) - err = filemap_write_and_wait(inode->i_mapping); - } + /* + * If an error already occurred (which fs/verity/ signals by passing + * desc == NULL), then only clean-up is needed. + */ + if (desc == NULL) + goto cleanup; - /* If we failed, truncate anything we wrote past i_size. */ - if (desc == NULL || err) - ext4_truncate(inode); + /* Append the verity descriptor. */ + err = ext4_write_verity_descriptor(inode, desc, desc_size, + merkle_tree_size); + if (err) + goto cleanup; /* - * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and - * deleting the inode from the orphan list, even if something failed. - * If everything succeeded, we'll also set the verity bit in the same - * transaction. + * Write all pages (both data and verity metadata). Note that this must + * happen before clearing EXT4_STATE_VERITY_IN_PROGRESS; otherwise pages + * beyond i_size won't be written properly. For crash consistency, this + * also must happen before the verity inode flag gets persisted. */ + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto cleanup; - ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + /* + * Finally, set the verity inode flag and remove the inode from the + * orphan list (in a single transaction). 
+ */ handle = ext4_journal_start(inode, EXT4_HT_INODE, credits); if (IS_ERR(handle)) { - ext4_orphan_del(NULL, inode); - return PTR_ERR(handle); + err = PTR_ERR(handle); + goto cleanup; } - err2 = ext4_orphan_del(handle, inode); - if (err2) - goto out_stop; + err = ext4_orphan_del(handle, inode); + if (err) + goto stop_and_cleanup; - if (desc != NULL && !err) { - struct ext4_iloc iloc; + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto stop_and_cleanup; - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) - goto out_stop; - ext4_set_inode_flag(inode, EXT4_INODE_VERITY); - ext4_set_inode_flags(inode, false); - err = ext4_mark_iloc_dirty(handle, inode, &iloc); - } -out_stop: + ext4_set_inode_flag(inode, EXT4_INODE_VERITY); + ext4_set_inode_flags(inode, false); + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (err) + goto stop_and_cleanup; + + ext4_journal_stop(handle); + + ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + return 0; + +stop_and_cleanup: ext4_journal_stop(handle); - return err ?: err2; +cleanup: + /* + * Verity failed to be enabled, so clean up by truncating any verity + * metadata that was written beyond i_size (both from cache and from + * disk), removing the inode from the orphan list (if it wasn't done + * already), and clearing EXT4_STATE_VERITY_IN_PROGRESS. + */ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); + ext4_orphan_del(NULL, inode); + ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); + return err; } static int ext4_get_verity_descriptor_location(struct inode *inode, -- cgit v1.2.3 From b4250dd868d1b42c0a65de11ef3afbee67ba5d2f Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 11 Mar 2021 10:55:00 -0500 Subject: NFSD: fix error handling in NFSv4.0 callbacks When the server tries to do a callback and a client fails it due to authentication problems, we need the server to set callback down flag in RENEW so that client can recover. Suggested-by: Bruce Fields Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever Tested-by: Benjamin Coddington Link: https://lore.kernel.org/linux-nfs/FB84E90A-1A03-48B3-8BF7-D9D10AC2C9FE@oracle.com/T/#t --- fs/nfsd/nfs4callback.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 052be5bf9ef5..7325592b456e 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1189,6 +1189,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case -EIO: case -ETIMEDOUT: + case -EACCES: nfsd4_mark_cb_down(clp, task->tk_status); } break; -- cgit v1.2.3 From 5c2469e0a22e035d52f3ba768151cc75e3d4a1cd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Mar 2021 10:17:56 -0700 Subject: io_uring: force creation of separate context for ATTACH_WQ and non-threads Earlier kernels had SQPOLL threads that could share across anything, as we grabbed the context we needed on a per-ring basis. This is no longer the case, so only allow attaching directly if we're in the same thread group. That is the common use case. For non-group tasks, just setup a new context and thread as we would've done if sharing wasn't set. This isn't 100% ideal in terms of CPU utilization for the forked and share case, but hopefully that isn't much of a concern. If it is, there are plans in motion for how to improve that. Most importantly, we want to avoid app side regressions where sharing worked before and now doesn't. 
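To make the fall-through concrete, here is a minimal userspace sketch of the attach-or-create flow (illustrative only; try_attach() and get_sqd() are stand-ins for io_attach_sq_data() and io_get_sq_data(), not kernel APIs):

  #include <errno.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct sqd { int id; };

  /* Stand-in for io_attach_sq_data(): only same-tgid callers may attach. */
  static struct sqd *try_attach(int caller_tgid, int owner_tgid, int *err)
  {
          static struct sqd shared = { .id = 1 };

          if (caller_tgid != owner_tgid) {
                  *err = -EPERM;   /* not fatal: caller falls back to a fresh sqd */
                  return NULL;
          }
          *err = 0;
          return &shared;
  }

  /* Stand-in for io_get_sq_data(): attach if allowed, else create new. */
  static struct sqd *get_sqd(int caller_tgid, int owner_tgid)
  {
          int err;
          struct sqd *sqd = try_attach(caller_tgid, owner_tgid, &err);

          if (sqd)
                  return sqd;            /* attached to the existing context */
          if (err != -EPERM)
                  return NULL;           /* any other error is fatal */
          sqd = calloc(1, sizeof(*sqd)); /* EPERM: quietly set up our own */
          if (sqd)
                  sqd->id = 2;
          return sqd;
  }

  int main(void)
  {
          printf("same group -> sqd %d\n", get_sqd(100, 100)->id);  /* 1 */
          printf("other group -> sqd %d\n", get_sqd(200, 100)->id); /* 2 */
          return 0;
  }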
With this patch, functionality is equivalent to previous kernels that supported IORING_SETUP_ATTACH_WQ with SQPOLL. Reported-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5c045a9f7ffe..472eab7359f2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -269,6 +269,7 @@ struct io_sq_data { unsigned sq_thread_idle; int sq_cpu; pid_t task_pid; + pid_t task_tgid; unsigned long state; struct completion startup; @@ -7112,6 +7113,10 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) fdput(f); return ERR_PTR(-EINVAL); } + if (sqd->task_tgid != current->tgid) { + fdput(f); + return ERR_PTR(-EPERM); + } refcount_inc(&sqd->refs); fdput(f); @@ -7122,8 +7127,14 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) { struct io_sq_data *sqd; - if (p->flags & IORING_SETUP_ATTACH_WQ) - return io_attach_sq_data(p); + if (p->flags & IORING_SETUP_ATTACH_WQ) { + sqd = io_attach_sq_data(p); + if (!IS_ERR(sqd)) + return sqd; + /* fall through for EPERM case, setup new sqd/task */ + if (PTR_ERR(sqd) != -EPERM) + return sqd; + } sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); if (!sqd) @@ -7833,6 +7844,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, } sqd->task_pid = current->pid; + sqd->task_tgid = current->tgid; tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); if (IS_ERR(tsk)) { ret = PTR_ERR(tsk); -- cgit v1.2.3 From d052d1d685f5125249ab4ff887562c88ba959638 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Mar 2021 10:49:20 -0700 Subject: io_uring: perform IOPOLL reaping if canceler is thread itself We bypass IOPOLL completion polling (and reaping) for the SQPOLL thread, but if it's the thread itself invoking cancelations, then we still need to perform it or no one will. Fixes: 9936c7c2bc76 ("io_uring: deduplicate core cancellations sequence") Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 472eab7359f2..49f85f49e1c3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8658,7 +8658,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } /* SQPOLL thread does its own polling */ - if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) { + if ((!(ctx->flags & IORING_SETUP_SQPOLL) && !files) || + (ctx->sq_data && ctx->sq_data->thread == current)) { while (!list_empty_careful(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; -- cgit v1.2.3 From 4f8be1f53bf615102d103c0509ffa9596f65b718 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 28 Jan 2021 17:36:38 -0500 Subject: nfs: we don't support removing system.nfs4_acl The NFSv4 protocol doesn't have any notion of removing an attribute, so removexattr(path,"system.nfs4_acl") doesn't make sense. There's no documented return value. Arguably it could be EOPNOTSUPP but I'm a little worried an application might take that to mean that we don't support ACLs or xattrs. How about EINVAL? Signed-off-by: J.
Bruce Fields Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1002c4f66f3f..c65c4b41e2c1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5889,6 +5889,9 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); int ret, i; + /* You can't remove system.nfs4_acl: */ + if (buflen == 0) + return -EINVAL; if (!nfs4_server_supports_acls(server)) return -EOPNOTSUPP; if (npages > ARRAY_SIZE(pages)) -- cgit v1.2.3 From d5bf630f355d8c532bef2347cf90e8ae60a5f1bd Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 12 Mar 2021 07:58:54 -0500 Subject: gfs2: bypass signal_our_withdraw if no journal Before this patch, function signal_our_withdraw referenced the journal inode immediately. But corrupt file systems may have some invalid journals, in which case our attempt to read the journal in will withdraw, and the resulting call to signal_our_withdraw would dereference a NULL value. This patch adds a check to signal_our_withdraw so that if the journal has not yet been initialized, it simply returns and does the old-style withdraw. Thanks to Andy Price for his analysis. Reported-by: syzbot+50a8a9cf8127f2c6f5df@syzkaller.appspotmail.com Fixes: 601ef0d52e96 ("gfs2: Force withdraw to replay journals and wait for it to finish") Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/util.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 58743315cda9..4f034b87b427 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -119,17 +119,22 @@ void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh) static void signal_our_withdraw(struct gfs2_sbd *sdp) { struct gfs2_glock *live_gl = sdp->sd_live_gh.gh_gl; - struct inode *inode = sdp->sd_jdesc->jd_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_glock *i_gl = ip->i_gl; - u64 no_formal_ino = ip->i_no_formal_ino; + struct inode *inode; + struct gfs2_inode *ip; + struct gfs2_glock *i_gl; + u64 no_formal_ino; int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); int ret = 0; int tries; - if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags) || !sdp->sd_jdesc) return; + inode = sdp->sd_jdesc->jd_inode; + ip = GFS2_I(inode); + i_gl = ip->i_gl; + no_formal_ino = ip->i_no_formal_ino; + /* Prevent any glock dq until withdraw recovery is complete */ set_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); /* -- cgit v1.2.3 From e1915f76a8981f0a750cf56515df42582a37c4b0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 11 Mar 2021 23:29:35 +0000 Subject: io_uring: cancel deferred requests in try_cancel As io_uring_cancel_files() and others let the SQPOLL task run between calls to io_uring_try_cancel_requests(), the SQPOLL task may generate new deferred requests, so it's safer to try to cancel them in io_uring_try_cancel_requests() itself.
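The helper follows the usual two-phase cancellation pattern: unlink matching entries onto a private list under the lock, then complete them with the lock dropped, and report whether anything was found. A rough userspace analog using pthreads (defer_entry and the task_id matching are illustrative assumptions, not the kernel types):

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct defer_entry {
          int task_id;                 /* owner, used for matching */
          struct defer_entry *next;
  };

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static struct defer_entry *deferred; /* global deferred list */

  /* Cancel all entries belonging to task_id; returns true if any found. */
  static bool cancel_defer(int task_id)
  {
          struct defer_entry *matched = NULL, **p;

          pthread_mutex_lock(&lock);
          for (p = &deferred; *p; ) {
                  struct defer_entry *de = *p;

                  if (de->task_id == task_id) {
                          *p = de->next;      /* unlink under the lock */
                          de->next = matched; /* move to private list */
                          matched = de;
                  } else {
                          p = &de->next;
                  }
          }
          pthread_mutex_unlock(&lock);

          if (!matched)
                  return false;               /* nothing to cancel */

          while (matched) {                   /* complete outside the lock */
                  struct defer_entry *de = matched;

                  matched = de->next;
                  printf("canceled request of task %d\n", de->task_id);
                  free(de);
          }
          return true;
  }

  int main(void)
  {
          for (int i = 0; i < 3; i++) {
                  struct defer_entry *de = malloc(sizeof(*de));

                  de->task_id = i % 2;
                  de->next = deferred;
                  deferred = de;
          }
          printf("task 0 had work: %d\n", cancel_defer(0));
          printf("task 0 again: %d\n", cancel_defer(0));
          return 0;
  }

Returning a bool lets the caller fold the result into its overall progress flag, which is how io_uring_try_cancel_requests() consumes it below (ret |= ...).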
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 49f85f49e1c3..56f3d8f408c9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8577,11 +8577,11 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) return ret; } -static void io_cancel_defer_files(struct io_ring_ctx *ctx, +static bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files) { - struct io_defer_entry *de = NULL; + struct io_defer_entry *de; LIST_HEAD(list); spin_lock_irq(&ctx->completion_lock); @@ -8592,6 +8592,8 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, } } spin_unlock_irq(&ctx->completion_lock); + if (list_empty(&list)) + return false; while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); @@ -8601,6 +8603,7 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, io_req_complete(de->req, -ECANCELED); kfree(de); } + return true; } static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) @@ -8666,6 +8669,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } + ret |= io_cancel_defer_files(ctx, task, files); ret |= io_poll_remove_all(ctx, task, files); ret |= io_kill_timeouts(ctx, task, files); ret |= io_run_task_work(); @@ -8734,8 +8738,6 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, atomic_inc(&task->io_uring->in_idle); } - io_cancel_defer_files(ctx, task, files); - io_uring_cancel_files(ctx, task, files); if (!files) io_uring_try_cancel_requests(ctx, task, NULL); -- cgit v1.2.3 From 0df8ea602b3fe80819a34361027ad40485e78909 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 11 Mar 2021 23:29:36 +0000 Subject: io_uring: remove useless ->startup completion We always do complete(&sqd->startup) almost right after sqd->thread creation, either in the success path or in io_sq_thread_finish(). The thread is specifically created stopped, not started, so that we can set up things like sqd->thread and call io_uring_alloc_task_context() before it starts running immediately after wake_up_new_task().
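In pthread terms the idiom looks like this: as long as every field the new thread reads is written before the thread is created, creation itself is the ordering point and no startup handshake is required (a hedged sketch; sq_data and the field names are illustrative):

  #include <pthread.h>
  #include <stdio.h>

  struct sq_data {
          int idle_ms;                /* set up before the thread exists */
          pthread_t thread;
  };

  static void *sq_thread(void *arg)
  {
          struct sq_data *sqd = arg;

          /* Safe without any startup completion: idle_ms was written
           * before pthread_create(), which orders the store for the new
           * thread, much like setting up sqd before wake_up_new_task(). */
          printf("thread sees idle_ms=%d\n", sqd->idle_ms);
          return NULL;
  }

  int main(void)
  {
          struct sq_data sqd;

          sqd.idle_ms = 1000;         /* all setup first... */
          pthread_create(&sqd.thread, NULL, sq_thread, &sqd); /* ...then run */
          pthread_join(sqd.thread, NULL);
          return 0;
  }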
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 56f3d8f408c9..6349374d715d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -272,7 +272,6 @@ struct io_sq_data { pid_t task_tgid; unsigned long state; - struct completion startup; struct completion exited; }; @@ -6656,8 +6655,6 @@ static int io_sq_thread(void *data) set_cpus_allowed_ptr(current, cpu_online_mask); current->flags |= PF_NO_SETAFFINITY; - wait_for_completion(&sqd->startup); - down_read(&sqd->rw_lock); while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) { @@ -7080,7 +7077,6 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx) struct io_sq_data *sqd = ctx->sq_data; if (sqd) { - complete(&sqd->startup); io_sq_thread_park(sqd); list_del(&ctx->sqd_list); io_sqd_update_thread_idle(sqd); @@ -7144,7 +7140,6 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) INIT_LIST_HEAD(&sqd->ctx_list); init_rwsem(&sqd->rw_lock); init_waitqueue_head(&sqd->wait); - init_completion(&sqd->startup); init_completion(&sqd->exited); return sqd; } @@ -7856,7 +7851,6 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, wake_up_new_task(tsk); if (ret) goto err; - complete(&sqd->startup); } else if (p->flags & IORING_SETUP_SQ_AFF) { /* Can't have SQ_AFF without SQPOLL */ ret = -EINVAL; -- cgit v1.2.3 From 0efc4976e3da40b09c592b21f722022d8f12a16b Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 12 Mar 2021 08:47:47 -0500 Subject: gfs2: bypass log flush if the journal is not live Patch fe3e397668775 ("gfs2: Rework the log space allocation logic") changed gfs2_log_flush to reserve a set of journal blocks in case no transaction is active. However, gfs2_log_flush also gets called in cases where we don't have an active journal, for example, for spectator mounts. In that case, trying to reserve blocks would sleep forever, but we want gfs2_log_flush to be a no-op instead. Fixes: fe3e397668775 ("gfs2: Rework the log space allocation logic") Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 760af666576c..6410281546f9 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -1036,7 +1036,7 @@ repeat: * Do this check while holding the log_flush_lock to prevent new * buffers from being added to the ail via gfs2_pin() */ - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawn(sdp) || !test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) goto out; /* Log might have been flushed while we waited for the flush lock */ -- cgit v1.2.3 From 26984fbf3ad9d1c1fb56a0c1e0cdf9fa3b806f0c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 11 Mar 2021 23:29:37 +0000 Subject: io_uring: prevent racy sqd->thread checks SQPOLL thread to which we're trying to attach may be going away, it's not nice but a more serious problem is if io_sq_offload_create() sees sqd->thread==NULL, and tries to init it with a new thread. There are tons of ways it can be exploited or fail. 
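The underlying rule is that a peer thread's liveness may only be checked under the same lock its teardown takes. Sketched with a plain mutex (try_attach() and thread_alive are illustrative stand-ins; in the actual patch, parking plays the role of the lock):

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static bool thread_alive;           /* cleared by teardown under lock */
  static int nr_users;

  /* Attach only if the worker is still there; decide under the lock. */
  static bool try_attach(void)
  {
          bool ok;

          pthread_mutex_lock(&lock);
          ok = thread_alive;          /* an unlocked check would be racy */
          if (ok)
                  nr_users++;
          pthread_mutex_unlock(&lock);
          return ok;
  }

  int main(void)
  {
          thread_alive = true;
          printf("attach while alive: %d\n", try_attach());
          pthread_mutex_lock(&lock);
          thread_alive = false;       /* teardown under the same lock */
          pthread_mutex_unlock(&lock);
          printf("attach after death: %d\n", try_attach());
          return 0;
  }

In the patch below, io_sq_thread_park() pins sqd->thread for the duration of the check, so a dying thread cannot disappear between the test and the attach.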
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6349374d715d..cdec59510433 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7119,14 +7119,18 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) return sqd; } -static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) +static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, + bool *attached) { struct io_sq_data *sqd; + *attached = false; if (p->flags & IORING_SETUP_ATTACH_WQ) { sqd = io_attach_sq_data(p); - if (!IS_ERR(sqd)) + if (!IS_ERR(sqd)) { + *attached = true; return sqd; + } /* fall through for EPERM case, setup new sqd/task */ if (PTR_ERR(sqd) != -EPERM) return sqd; @@ -7799,12 +7803,13 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, if (ctx->flags & IORING_SETUP_SQPOLL) { struct task_struct *tsk; struct io_sq_data *sqd; + bool attached; ret = -EPERM; if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) goto err; - sqd = io_get_sq_data(p); + sqd = io_get_sq_data(p, &attached); if (IS_ERR(sqd)) { ret = PTR_ERR(sqd); goto err; @@ -7816,13 +7821,24 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, if (!ctx->sq_thread_idle) ctx->sq_thread_idle = HZ; + ret = 0; io_sq_thread_park(sqd); - list_add(&ctx->sqd_list, &sqd->ctx_list); - io_sqd_update_thread_idle(sqd); + /* don't attach to a dying SQPOLL thread, would be racy */ + if (attached && !sqd->thread) { + ret = -ENXIO; + } else { + list_add(&ctx->sqd_list, &sqd->ctx_list); + io_sqd_update_thread_idle(sqd); + } io_sq_thread_unpark(sqd); - if (sqd->thread) + if (ret < 0) { + io_put_sq_data(sqd); + ctx->sq_data = NULL; + return ret; + } else if (attached) { return 0; + } if (p->flags & IORING_SETUP_SQ_AFF) { int cpu = p->sq_thread_cpu; -- cgit v1.2.3 From 521d6a737a31c08dbab204a95cd4fb5bee725f0f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 11 Mar 2021 23:29:38 +0000 Subject: io_uring: cancel sqpoll via task_work 1) The first problem is io_uring_cancel_sqpoll() -> io_uring_cancel_task_requests() basically doing park(); park(); and so hanging. 2) Another one is more subtle: when the master task is doing cancellations, the SQPOLL task may submit, in between the end of the cancellation and finish(), requests that take a ref to the ctx and so eternally lock it up. 3) Yet another is a dying SQPOLL task doing io_uring_cancel_sqpoll() and the same io_uring_cancel_sqpoll() from the owner task; they race for tctx->wait events. And there are probably more of them. Instead do SQPOLL cancellations from within SQPOLL task context via task_work, see io_sqpoll_cancel_sync(). With that we don't need the temporary park()/unpark() during cancellation, which is ugly, subtle and in any case doesn't allow us to do io_run_task_work() properly. io_uring_cancel_sqpoll() is called only from SQPOLL task context and under sqd locking, so all parking is removed from there. And so io_sq_thread_[un]park() and io_sq_thread_stop() are no longer used by the SQPOLL task, which spares us some headache. Also remove the ctx from the sqd list early to avoid 2). And kill tctx->sqpoll, which is not used anymore.
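The io_sqpoll_cancel_sync() mechanism reduces to a familiar shape: hand the target thread a callback, wake it, and wait for a completion it signals when done. A small pthread analog (the one-slot queue and the names are illustrative assumptions):

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
  static void (*queued_work)(void);   /* one-slot "task_work" queue */
  static int done;

  static void cancel_cb(void)
  {
          /* Runs in the worker's own context, so no parking is needed. */
          printf("worker: canceling my own requests\n");
  }

  static void *worker(void *arg)
  {
          void (*work)(void);

          (void)arg;
          pthread_mutex_lock(&lock);
          while (!queued_work)        /* idle until work is queued */
                  pthread_cond_wait(&cond, &lock);
          work = queued_work;
          pthread_mutex_unlock(&lock);

          work();                     /* run the callback... */

          pthread_mutex_lock(&lock);
          done = 1;                   /* ...and signal completion */
          pthread_cond_broadcast(&cond);
          pthread_mutex_unlock(&lock);
          return NULL;
  }

  int main(void)
  {
          pthread_t t;

          pthread_create(&t, NULL, worker, NULL);

          pthread_mutex_lock(&lock);
          queued_work = cancel_cb;    /* task_work_add() analog */
          pthread_cond_broadcast(&cond);
          while (!done)               /* wait_for_completion() analog */
                  pthread_cond_wait(&cond, &lock);
          pthread_mutex_unlock(&lock);

          pthread_join(t, NULL);
          return 0;
  }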
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 151 +++++++++++++++++++++++++++------------------------------- 1 file changed, 71 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index cdec59510433..70286b393c0e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6665,6 +6665,7 @@ static int io_sq_thread(void *data) up_read(&sqd->rw_lock); cond_resched(); down_read(&sqd->rw_lock); + io_run_task_work(); timeout = jiffies + sqd->sq_thread_idle; continue; } @@ -6720,18 +6721,22 @@ static int io_sq_thread(void *data) finish_wait(&sqd->wait, &wait); timeout = jiffies + sqd->sq_thread_idle; } - - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_uring_cancel_sqpoll(ctx); up_read(&sqd->rw_lock); - + down_write(&sqd->rw_lock); + /* + * someone may have parked and added a cancellation task_work, run + * it first because we don't want it in io_uring_cancel_sqpoll() + */ io_run_task_work(); - down_write(&sqd->rw_lock); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_uring_cancel_sqpoll(ctx); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); up_write(&sqd->rw_lock); + + io_run_task_work(); complete(&sqd->exited); do_exit(0); } @@ -7033,8 +7038,8 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) static void io_sq_thread_unpark(struct io_sq_data *sqd) __releases(&sqd->rw_lock) { - if (sqd->thread == current) - return; + WARN_ON_ONCE(sqd->thread == current); + clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); up_write(&sqd->rw_lock); } @@ -7042,8 +7047,8 @@ static void io_sq_thread_unpark(struct io_sq_data *sqd) static void io_sq_thread_park(struct io_sq_data *sqd) __acquires(&sqd->rw_lock) { - if (sqd->thread == current) - return; + WARN_ON_ONCE(sqd->thread == current); + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); down_write(&sqd->rw_lock); /* set again for consistency, in case concurrent parks are happening */ @@ -7054,8 +7059,8 @@ static void io_sq_thread_park(struct io_sq_data *sqd) static void io_sq_thread_stop(struct io_sq_data *sqd) { - if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) - return; + WARN_ON_ONCE(sqd->thread == current); + down_write(&sqd->rw_lock); set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); if (sqd->thread) @@ -7078,7 +7083,7 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx) if (sqd) { io_sq_thread_park(sqd); - list_del(&ctx->sqd_list); + list_del_init(&ctx->sqd_list); io_sqd_update_thread_idle(sqd); io_sq_thread_unpark(sqd); @@ -7760,7 +7765,6 @@ static int io_uring_alloc_task_context(struct task_struct *task, init_waitqueue_head(&tctx->wait); tctx->last = NULL; atomic_set(&tctx->in_idle, 0); - tctx->sqpoll = false; task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); @@ -8719,43 +8723,12 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, io_uring_try_cancel_requests(ctx, task, files); - if (ctx->sq_data) - io_sq_thread_unpark(ctx->sq_data); prepare_to_wait(&task->io_uring->wait, &wait, TASK_UNINTERRUPTIBLE); if (inflight == io_uring_count_inflight(ctx, task, files)) schedule(); finish_wait(&task->io_uring->wait, &wait); - if (ctx->sq_data) - io_sq_thread_park(ctx->sq_data); - } -} - -/* - * We need to iteratively cancel requests, in case a request has dependent - * hard links. These persist even for failure of cancelations, hence keep - * looping until none are found. 
- */ -static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, - struct files_struct *files) -{ - struct task_struct *task = current; - - if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { - io_sq_thread_park(ctx->sq_data); - task = ctx->sq_data->thread; - if (task) - atomic_inc(&task->io_uring->in_idle); } - - io_uring_cancel_files(ctx, task, files); - if (!files) - io_uring_try_cancel_requests(ctx, task, NULL); - - if (task) - atomic_dec(&task->io_uring->in_idle); - if (ctx->sq_data) - io_sq_thread_unpark(ctx->sq_data); } /* @@ -8796,15 +8769,6 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx) } tctx->last = ctx; } - - /* - * This is race safe in that the task itself is doing this, hence it - * cannot be going through the exit/cancel paths at the same time. - * This cannot be modified while exit/cancel is running. - */ - if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL)) - tctx->sqpoll = true; - return 0; } @@ -8847,6 +8811,44 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) } } +static s64 tctx_inflight(struct io_uring_task *tctx) +{ + return percpu_counter_sum(&tctx->inflight); +} + +static void io_sqpoll_cancel_cb(struct callback_head *cb) +{ + struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work); + struct io_ring_ctx *ctx = work->ctx; + struct io_sq_data *sqd = ctx->sq_data; + + if (sqd->thread) + io_uring_cancel_sqpoll(ctx); + complete(&work->completion); +} + +static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + struct io_tctx_exit work = { .ctx = ctx, }; + struct task_struct *task; + + io_sq_thread_park(sqd); + list_del_init(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); + task = sqd->thread; + if (task) { + init_completion(&work.completion); + init_task_work(&work.task_work, io_sqpoll_cancel_cb); + WARN_ON_ONCE(task_work_add(task, &work.task_work, TWA_SIGNAL)); + wake_up_process(task); + } + io_sq_thread_unpark(sqd); + + if (task) + wait_for_completion(&work.completion); +} + void __io_uring_files_cancel(struct files_struct *files) { struct io_uring_task *tctx = current->io_uring; @@ -8855,41 +8857,40 @@ void __io_uring_files_cancel(struct files_struct *files) /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); - xa_for_each(&tctx->xa, index, node) - io_uring_cancel_task_requests(node->ctx, files); + xa_for_each(&tctx->xa, index, node) { + struct io_ring_ctx *ctx = node->ctx; + + if (ctx->sq_data) { + io_sqpoll_cancel_sync(ctx); + continue; + } + io_uring_cancel_files(ctx, current, files); + if (!files) + io_uring_try_cancel_requests(ctx, current, NULL); + } atomic_dec(&tctx->in_idle); if (files) io_uring_clean_tctx(tctx); } -static s64 tctx_inflight(struct io_uring_task *tctx) -{ - return percpu_counter_sum(&tctx->inflight); -} - +/* should only be called by SQPOLL task */ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) { struct io_sq_data *sqd = ctx->sq_data; - struct io_uring_task *tctx; + struct io_uring_task *tctx = current->io_uring; s64 inflight; DEFINE_WAIT(wait); - if (!sqd) - return; - io_sq_thread_park(sqd); - if (!sqd->thread || !sqd->thread->io_uring) { - io_sq_thread_unpark(sqd); - return; - } - tctx = ctx->sq_data->thread->io_uring; + WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current); + atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ inflight = tctx_inflight(tctx); if (!inflight) break; - io_uring_cancel_task_requests(ctx, NULL); + io_uring_try_cancel_requests(ctx, current, 
NULL); prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); /* @@ -8902,7 +8903,6 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) finish_wait(&tctx->wait, &wait); } while (1); atomic_dec(&tctx->in_idle); - io_sq_thread_unpark(sqd); } /* @@ -8917,15 +8917,6 @@ void __io_uring_task_cancel(void) /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); - - if (tctx->sqpoll) { - struct io_tctx_node *node; - unsigned long index; - - xa_for_each(&tctx->xa, index, node) - io_uring_cancel_sqpoll(node->ctx); - } - do { /* read completions before cancelations */ inflight = tctx_inflight(tctx); -- cgit v1.2.3 From 58f99373834151e1ca7edc49bc5578d9d40db099 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 12 Mar 2021 16:25:55 +0000 Subject: io_uring: fix OP_ASYNC_CANCEL across tasks IORING_OP_ASYNC_CANCEL tries io-wq cancellation only for current task. If it fails go over tctx_list and try it out for every single tctx. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 70286b393c0e..a4bce17af506 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5667,8 +5667,47 @@ static int io_async_cancel_prep(struct io_kiocb *req, static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + u64 sqe_addr = req->cancel.addr; + struct io_tctx_node *node; + int ret; - io_async_find_and_cancel(ctx, req, req->cancel.addr, 0); + /* tasks should wait for their io-wq threads, so safe w/o sync */ + ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); + spin_lock_irq(&ctx->completion_lock); + if (ret != -ENOENT) + goto done; + ret = io_timeout_cancel(ctx, sqe_addr); + if (ret != -ENOENT) + goto done; + ret = io_poll_cancel(ctx, sqe_addr); + if (ret != -ENOENT) + goto done; + spin_unlock_irq(&ctx->completion_lock); + + /* slow path, try all io-wq's */ + io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + ret = -ENOENT; + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + if (!tctx || !tctx->io_wq) + continue; + ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); + if (ret != -ENOENT) + break; + } + io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + + spin_lock_irq(&ctx->completion_lock); +done: + io_cqring_fill_event(req, ret); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + io_cqring_ev_posted(ctx); + + if (ret < 0) + req_set_fail_links(req); + io_put_req(req); return 0; } -- cgit v1.2.3 From 16efa4fce3b7af17bb45d635c3e89992d721e0f3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Mar 2021 20:26:13 -0700 Subject: io_uring: allow IO worker threads to be frozen With the freezer using the proper signaling to notify us of when it's time to freeze a thread, we can re-enable normal freezer usage for the IO threads. Ensure that SQPOLL, io-wq, and the io-wq manager call try_to_freeze() appropriately, and remove the default setting of PF_NOFREEZE from create_io_thread(). 
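The worker loop shape this depends on, sleeping with a timeout and then re-checking stop/freeze state before the next round of work, can be sketched in userspace terms as follows (should_exit and maybe_pause() are illustrative stand-ins for the fatal-signal check and try_to_freeze()):

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <unistd.h>

  static _Atomic bool should_exit;
  static _Atomic bool pause_requested;

  /* Stand-in for try_to_freeze(): park here while a pause is requested. */
  static void maybe_pause(void)
  {
          while (pause_requested && !should_exit)
                  usleep(1000);
  }

  static void *io_worker(void *arg)
  {
          int loops = 0;

          (void)arg;
          while (!should_exit) {
                  usleep(1000);       /* schedule_timeout() analog */
                  maybe_pause();      /* honor freeze before more work */
                  if (++loops == 5)   /* demo: stop after a few rounds */
                          should_exit = true;
          }
          printf("worker exiting cleanly\n");
          return NULL;
  }

  int main(void)
  {
          pthread_t t;

          pthread_create(&t, NULL, io_worker, NULL);
          pthread_join(t, NULL);
          return 0;
  }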
Signed-off-by: Jens Axboe --- fs/io-wq.c | 6 +++++- fs/io_uring.c | 1 + kernel/fork.c | 1 - 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 0ae9ecadf295..e05f996d088f 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -488,6 +488,8 @@ static int io_wqe_worker(void *data) set_task_comm(current, buf); while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { + long ret; + set_current_state(TASK_INTERRUPTIBLE); loop: raw_spin_lock_irq(&wqe->lock); @@ -498,7 +500,8 @@ loop: __io_worker_idle(wqe, worker); raw_spin_unlock_irq(&wqe->lock); io_flush_signals(); - if (schedule_timeout(WORKER_IDLE_TIMEOUT)) + ret = schedule_timeout(WORKER_IDLE_TIMEOUT); + if (try_to_freeze() || ret) continue; if (fatal_signal_pending(current)) break; @@ -709,6 +712,7 @@ static int io_wq_manager(void *data) set_current_state(TASK_INTERRUPTIBLE); io_wq_check_workers(wq); schedule_timeout(HZ); + try_to_freeze(); if (fatal_signal_pending(current)) set_bit(IO_WQ_BIT_EXIT, &wq->state); } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)); diff --git a/fs/io_uring.c b/fs/io_uring.c index a4bce17af506..05adc4887ef3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6752,6 +6752,7 @@ static int io_sq_thread(void *data) up_read(&sqd->rw_lock); schedule(); + try_to_freeze(); down_read(&sqd->rw_lock); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); diff --git a/kernel/fork.c b/kernel/fork.c index 72e444cd0ffe..d3171e8e88e5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2436,7 +2436,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) if (!IS_ERR(tsk)) { sigfillset(&tsk->blocked); sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); - tsk->flags |= PF_NOFREEZE; } return tsk; } -- cgit v1.2.3 From ca6eb14d6453bea85ac66fa4c6ab75dfe93eaf45 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 12 Mar 2021 21:07:30 -0800 Subject: mm: use is_cow_mapping() across tree where proper After is_cow_mapping() is exported in mm.h, replace some manual checks elsewhere throughout the tree but start to use the new helper. Link: https://lkml.kernel.org/r/20210217233547.93892-5-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jason Gunthorpe Cc: VMware Graphics Cc: Roland Scheidegger Cc: David Airlie Cc: Daniel Vetter Cc: Mike Kravetz Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Christoph Hellwig Cc: David Gibson Cc: Gal Pressman Cc: Jan Kara Cc: Jann Horn Cc: Kirill Shutemov Cc: Kirill Tkhai Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Rapoport Cc: Wei Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 4 +--- drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c | 2 +- fs/proc/task_mmu.c | 2 -- mm/hugetlb.c | 4 +--- 4 files changed, 3 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c index 0a900afc66ff..45c9c6a7f1d6 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c @@ -500,8 +500,6 @@ vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf, vm_fault_t ret; pgoff_t fault_page_size; bool write = vmf->flags & FAULT_FLAG_WRITE; - bool is_cow_mapping = - (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; switch (pe_size) { case PE_SIZE_PMD: @@ -518,7 +516,7 @@ vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf, } /* Always do write dirty-tracking and COW on PTE level. 
*/ - if (write && (READ_ONCE(vbo->dirty) || is_cow_mapping)) + if (write && (READ_ONCE(vbo->dirty) || is_cow_mapping(vma->vm_flags))) return VM_FAULT_FALLBACK; ret = ttm_bo_vm_reserve(bo, vmf); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c index 3c03b1746661..cb9975889e2f 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c @@ -49,7 +49,7 @@ int vmw_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &vmw_vm_ops; /* Use VM_PFNMAP rather than VM_MIXEDMAP if not a COW mapping */ - if ((vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) != VM_MAYWRITE) + if (!is_cow_mapping(vma->vm_flags)) vma->vm_flags = (vma->vm_flags & ~VM_MIXEDMAP) | VM_PFNMAP; return 0; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3cec6fbef725..e862cab69583 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1036,8 +1036,6 @@ struct clear_refs_private { #ifdef CONFIG_MEM_SOFT_DIRTY -#define is_cow_mapping(flags) (((flags) & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) - static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct page *page; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b905143a523e..7786267da2fe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3734,15 +3734,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, pte_t *src_pte, *dst_pte, entry, dst_entry; struct page *ptepage; unsigned long addr; - int cow; + bool cow = is_cow_mapping(vma->vm_flags); struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct address_space *mapping = vma->vm_file->f_mapping; struct mmu_notifier_range range; int ret = 0; - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - if (cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, vma->vm_start, -- cgit v1.2.3 From e7850f4d844e0acfac7e570af611d89deade3146 Mon Sep 17 00:00:00 2001 From: Lior Ribak Date: Fri, 12 Mar 2021 21:07:41 -0800 Subject: binfmt_misc: fix possible deadlock in bm_register_write There is a deadlock in bm_register_write: First, in the beginning of the function, a lock is taken on the binfmt_misc root inode with inode_lock(d_inode(root)). Then, if the user used the MISC_FMT_OPEN_FILE flag, the function will call open_exec on the user-provided interpreter.
open_exec will call a path lookup, and if the path lookup process includes the root of binfmt_misc, it will try to take a shared lock on its inode again, but it is already locked, and the code will get stuck in a deadlock To reproduce the bug: $ echo ":iiiii:E::ii::/proc/sys/fs/binfmt_misc/bla:F" > /proc/sys/fs/binfmt_misc/register backtrace of where the lock occurs (#5): 0 schedule () at ./arch/x86/include/asm/current.h:15 1 0xffffffff81b51237 in rwsem_down_read_slowpath (sem=0xffff888003b202e0, count=, state=state@entry=2) at kernel/locking/rwsem.c:992 2 0xffffffff81b5150a in __down_read_common (state=2, sem=) at kernel/locking/rwsem.c:1213 3 __down_read (sem=) at kernel/locking/rwsem.c:1222 4 down_read (sem=) at kernel/locking/rwsem.c:1355 5 0xffffffff811ee22a in inode_lock_shared (inode=) at ./include/linux/fs.h:783 6 open_last_lookups (op=0xffffc9000022fe34, file=0xffff888004098600, nd=0xffffc9000022fd10) at fs/namei.c:3177 7 path_openat (nd=nd@entry=0xffffc9000022fd10, op=op@entry=0xffffc9000022fe34, flags=flags@entry=65) at fs/namei.c:3366 8 0xffffffff811efe1c in do_filp_open (dfd=, pathname=pathname@entry=0xffff8880031b9000, op=op@entry=0xffffc9000022fe34) at fs/namei.c:3396 9 0xffffffff811e493f in do_open_execat (fd=fd@entry=-100, name=name@entry=0xffff8880031b9000, flags=, flags@entry=0) at fs/exec.c:913 10 0xffffffff811e4a92 in open_exec (name=) at fs/exec.c:948 11 0xffffffff8124aa84 in bm_register_write (file=, buffer=, count=19, ppos=) at fs/binfmt_misc.c:682 12 0xffffffff811decd2 in vfs_write (file=file@entry=0xffff888004098500, buf=buf@entry=0xa758d0 ":iiiii:E::ii::i:CF ", count=count@entry=19, pos=pos@entry=0xffffc9000022ff10) at fs/read_write.c:603 13 0xffffffff811defda in ksys_write (fd=, buf=0xa758d0 ":iiiii:E::ii::i:CF ", count=19) at fs/read_write.c:658 14 0xffffffff81b49813 in do_syscall_64 (nr=, regs=0xffffc9000022ff58) at arch/x86/entry/common.c:46 15 0xffffffff81c0007c in entry_SYSCALL_64 () at arch/x86/entry/entry_64.S:120 To solve the issue, the open_exec call is moved to before the write lock is taken by bm_register_write Link: https://lkml.kernel.org/r/20210228224414.95962-1-liorribak@gmail.com Fixes: 948b701a607f1 ("binfmt_misc: add persistent opened binary handler for containers") Signed-off-by: Lior Ribak Acked-by: Helge Deller Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index c457334de43f..e1eae7ea823a 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -649,12 +649,24 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, struct super_block *sb = file_inode(file)->i_sb; struct dentry *root = sb->s_root, *dentry; int err = 0; + struct file *f = NULL; e = create_entry(buffer, count); if (IS_ERR(e)) return PTR_ERR(e); + if (e->flags & MISC_FMT_OPEN_FILE) { + f = open_exec(e->interpreter); + if (IS_ERR(f)) { + pr_notice("register: failed to install interpreter file %s\n", + e->interpreter); + kfree(e); + return PTR_ERR(f); + } + e->interp_file = f; + } + inode_lock(d_inode(root)); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); @@ -678,21 +690,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, goto out2; } - if (e->flags & MISC_FMT_OPEN_FILE) { - struct file *f; - - f = open_exec(e->interpreter); - if (IS_ERR(f)) { - err = PTR_ERR(f); - pr_notice("register: failed 
to install interpreter file %s\n", e->interpreter); - simple_release_fs(&bm_mnt, &entry_count); - iput(inode); - inode = NULL; - goto out2; - } - e->interp_file = f; - } - e->dentry = dget(dentry); inode->i_private = e; inode->i_fop = &bm_entry_operations; @@ -709,6 +706,8 @@ out: inode_unlock(d_inode(root)); if (err) { + if (f) + filp_close(f, NULL); kfree(e); return err; } -- cgit v1.2.3 From 9e15c3a0ced5a61f320b989072c24983cb1620c1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 13 Mar 2021 12:29:43 -0700 Subject: io_uring: convert io_buffer_idr to XArray Like we did for the personality idr, convert the IO buffer idr to use XArray. This avoids a use-after-free on removal of entries, since idr doesn't like doing so from inside an iterator, and it nicely reduces the amount of code we need to support this feature. Fixes: 5a2e745d4d43 ("io_uring: buffer registration infrastructure") Cc: stable@vger.kernel.org Cc: Matthew Wilcox Cc: yangerkun Reported-by: Hulk Robot Signed-off-by: Jens Axboe --- fs/io_uring.c | 43 +++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 05adc4887ef3..58d62dd9f8e4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -402,7 +402,7 @@ struct io_ring_ctx { struct socket *ring_sock; #endif - struct idr io_buffer_idr; + struct xarray io_buffers; struct xarray personalities; u32 pers_next; @@ -1135,7 +1135,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); - idr_init(&ctx->io_buffer_idr); + xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); @@ -2843,7 +2843,7 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, lockdep_assert_held(&req->ctx->uring_lock); - head = idr_find(&req->ctx->io_buffer_idr, bgid); + head = xa_load(&req->ctx->io_buffers, bgid); if (head) { if (!list_empty(&head->list)) { kbuf = list_last_entry(&head->list, struct io_buffer, @@ -2851,7 +2851,7 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, list_del(&kbuf->list); } else { kbuf = head; - idr_remove(&req->ctx->io_buffer_idr, bgid); + xa_erase(&req->ctx->io_buffers, bgid); } if (*len > kbuf->len) *len = kbuf->len; @@ -3892,7 +3892,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, } i++; kfree(buf); - idr_remove(&ctx->io_buffer_idr, bgid); + xa_erase(&ctx->io_buffers, bgid); return i; } @@ -3910,7 +3910,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) lockdep_assert_held(&ctx->uring_lock); ret = -ENOENT; - head = idr_find(&ctx->io_buffer_idr, p->bgid); + head = xa_load(&ctx->io_buffers, p->bgid); if (head) ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); if (ret < 0) @@ -3993,21 +3993,14 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) lockdep_assert_held(&ctx->uring_lock); - list = head = idr_find(&ctx->io_buffer_idr, p->bgid); + list = head = xa_load(&ctx->io_buffers, p->bgid); ret = io_add_buffers(p, &head); - if (ret < 0) - goto out; - - if (!list) { - ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1, - GFP_KERNEL); - if (ret < 0) { + if (ret >= 0 && !list) { + ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL); + if (ret < 0) __io_remove_buffers(ctx, head, 
p->bgid, -1U); - goto out; - } } -out: if (ret < 0) req_set_fail_links(req); @@ -8333,19 +8326,13 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) return -ENXIO; } -static int __io_destroy_buffers(int id, void *p, void *data) -{ - struct io_ring_ctx *ctx = data; - struct io_buffer *buf = p; - - __io_remove_buffers(ctx, buf, id, -1U); - return 0; -} - static void io_destroy_buffers(struct io_ring_ctx *ctx) { - idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx); - idr_destroy(&ctx->io_buffer_idr); + struct io_buffer *buf; + unsigned long index; + + xa_for_each(&ctx->io_buffers, index, buf) + __io_remove_buffers(ctx, buf, index, -1U); } static void io_req_cache_free(struct list_head *list, struct task_struct *tsk) -- cgit v1.2.3 From 5171317dfd9afcf729799d31fffdbb9e71e45402 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 10 Mar 2021 10:22:27 +0000 Subject: cifs: update new ACE pointer after populate_new_aces. After the fix for retaining externally set ACEs with cifsacl and modefromsid,idsfromsid, there was an issue in populating the inherited ACEs after setting the ACEs introduced by these two modes. Fixed this by updating the ACE pointer again after the call to populate_new_aces. Signed-off-by: Shyam Prasad N Reviewed-by: Rohith Surabattula Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 9d29eb9660c2..2be22a5c690f 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1118,7 +1118,6 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, /* Retain old ACEs which we can retain */ for (i = 0; i < src_num_aces; ++i) { pntace = (struct cifs_ace *) (acl_base + size); - pnntace = (struct cifs_ace *) (nacl_base + nsize); if (!new_aces_set && (pntace->flags & INHERITED_ACE)) { /* Place the new ACEs in between existing explicit and inherited */ @@ -1131,14 +1130,18 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, } /* If it's any one of the ACE we're replacing, skip! */ - if ((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) || + if (!mode_from_sid && + ((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) || (compare_sids(&pntace->sid, pownersid) == 0) || (compare_sids(&pntace->sid, pgrpsid) == 0) || (compare_sids(&pntace->sid, &sid_everyone) == 0) || - (compare_sids(&pntace->sid, &sid_authusers) == 0)) { + (compare_sids(&pntace->sid, &sid_authusers) == 0))) { goto next_ace; } + /* update the pointer to the next ACE to populate*/ + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += cifs_copy_ace(pnntace, pntace, NULL); num_aces++; -- cgit v1.2.3 From 05946d4b7a7349ae58bfa2d51ae832e64a394c2d Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Wed, 10 Mar 2021 13:20:40 +0100 Subject: cifs: Fix preauth hash corruption smb311_update_preauth_hash() uses the shash in server->secmech without appropriate locking, and this can lead to sessions corrupting each other's preauth hashes. The following script can easily trigger the problem: #!/bin/sh -e NMOUNTS=10 for i in $(seq $NMOUNTS); do mkdir -p /tmp/mnt$i umount /tmp/mnt$i 2>/dev/null || : done while :; do for i in $(seq $NMOUNTS); do mount -t cifs //192.168.0.1/test /tmp/mnt$i -o ...
& done wait for i in $(seq $NMOUNTS); do umount /tmp/mnt$i done done Usually within seconds this leads to one or more of the mounts failing with the following errors, and a "Bad SMB2 signature for message" is seen in the server logs: CIFS: VFS: \\192.168.0.1 failed to connect to IPC (rc=-13) CIFS: VFS: cifs_mount failed w/return code = -13 Fix it by holding the server mutex just like in the other places where the shashes are used. Fixes: 8bd68c6e47abff34e4 ("CIFS: implement v3.11 preauth integrity") Signed-off-by: Vincent Whitchurch CC: Reviewed-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/transport.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 007d99437c77..c1725b55f364 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -1196,9 +1196,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. */ - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { + mutex_lock(&server->srv_mutex); smb311_update_preauth_hash(ses, rqst[0].rq_iov, rqst[0].rq_nvec); + mutex_unlock(&server->srv_mutex); + } for (i = 0; i < num_rqst; i++) { rc = wait_for_response(server, midQ[i]); @@ -1266,7 +1269,9 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, .iov_base = resp_iov[0].iov_base, .iov_len = resp_iov[0].iov_len }; + mutex_lock(&server->srv_mutex); smb311_update_preauth_hash(ses, &iov, 1); + mutex_unlock(&server->srv_mutex); } out: -- cgit v1.2.3 From 62dd0f98a0e5668424270b47a0c2e973795faba7 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Mon, 15 Mar 2021 12:24:00 +0000 Subject: gfs2: Flag a withdraw if init_threads() fails Interrupting mount with ^C quickly enough can cause the kthread_run() calls in gfs2's init_threads() to fail and the error path leads to a deadlock on the s_umount rwsem. The abridged chain of events is: [mount path] get_tree_bdev() sget_fc() alloc_super() down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); [acquired] gfs2_fill_super() gfs2_make_fs_rw() init_threads() kthread_run() ( Interrupted ) [Error path] gfs2_gl_hash_clear() flush_workqueue(glock_workqueue) wait_for_completion() [workqueue context] glock_work_func() run_queue() do_xmote() freeze_go_sync() freeze_super() down_write(&sb->s_umount) [deadlock] In freeze_go_sync() there is a gfs2_withdrawn() check that we can use to make sure freeze_super() is not called in the error path, so add a gfs2_withdraw_delayed() call when init_threads() fails. 
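The shape of the fix, setting a state flag in the failing path so that a nested path which would retake the lock turns into a no-op, can be sketched as follows (freeze_sync() and the rwlock stand in for freeze_go_sync() and s_umount; illustrative only):

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  static pthread_rwlock_t s_umount = PTHREAD_RWLOCK_INITIALIZER;
  static bool withdrawn;              /* gfs2_withdraw_delayed() analog */

  /* freeze_go_sync() analog: bail out instead of retaking s_umount. */
  static void freeze_sync(void)
  {
          if (withdrawn) {
                  printf("withdrawn: skipping freeze\n");
                  return;             /* avoids the nested-lock deadlock */
          }
          pthread_rwlock_wrlock(&s_umount);
          printf("froze filesystem\n");
          pthread_rwlock_unlock(&s_umount);
  }

  int main(void)
  {
          pthread_rwlock_wrlock(&s_umount); /* mount path holds s_umount */
          withdrawn = true;                 /* init_threads() failed */
          freeze_sync();                    /* would deadlock without flag */
          pthread_rwlock_unlock(&s_umount);
          return 0;
  }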
Ref: https://bugzilla.kernel.org/show_bug.cgi?id=212231 Reported-by: Alexander Aring Signed-off-by: Andrew Price Signed-off-by: Andreas Gruenbacher --- fs/gfs2/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 97076d3f562f..9e91c9d92bd6 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -162,8 +162,10 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) int error; error = init_threads(sdp); - if (error) + if (error) { + gfs2_withdraw_delayed(sdp); return error; + } j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); if (gfs2_withdrawn(sdp)) { -- cgit v1.2.3 From efe814a471e0e58f28f1efaf430c8784a4f36626 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 14 Mar 2021 20:57:08 +0000 Subject: io_uring: fix ->flags races by linked timeouts It's racy to modify req->flags from a not owning context, e.g. linked timeout calling req_set_fail_links() for the master request might race with that request setting/clearing flags while being executed concurrently. Just remove req_set_fail_links(prev) from io_link_timeout_fn(), io_async_find_and_cancel() and functions down the line take care of setting the fail bit. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 58d62dd9f8e4..217f72d50ff5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6197,7 +6197,6 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { - req_set_fail_links(prev); io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); io_put_req_deferred(prev, 1); } else { -- cgit v1.2.3 From 180f829fe4026bd192447d261e712b6cb84f6202 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 14 Mar 2021 20:57:09 +0000 Subject: io_uring: fix complete_post use ctx after free If io_req_complete_post() put not a final ref, we can't rely on the request's ctx ref, and so ctx may potentially be freed while complete_post() is in io_cqring_ev_posted()/etc. In that case get an additional ctx reference, and put it in the end, so protecting following io_cqring_ev_posted(). And also prolong ctx lifetime until spin_unlock happens, as we do with mutexes, so added percpu_ref_get() doesn't race with ctx free. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 217f72d50ff5..f8a683607b25 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1550,14 +1550,17 @@ static void io_req_complete_post(struct io_kiocb *req, long res, io_put_task(req->task, 1); list_add(&req->compl.list, &cs->locked_free_list); cs->locked_free_nr++; - } else - req = NULL; + } else { + if (!percpu_ref_tryget(&ctx->refs)) + req = NULL; + } io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); - io_cqring_ev_posted(ctx); - if (req) + if (req) { + io_cqring_ev_posted(ctx); percpu_ref_put(&ctx->refs); + } } static void io_req_complete_state(struct io_kiocb *req, long res, @@ -8373,11 +8376,13 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) { /* * Some may use context even when all refs and requests have been put, - * and they are free to do so while still holding uring_lock, see - * __io_req_task_submit(). Wait for them to finish. 
+ * and they are free to do so while still holding uring_lock or + * completion_lock, see __io_req_task_submit(). Wait for them to finish. */ mutex_lock(&ctx->uring_lock); mutex_unlock(&ctx->uring_lock); + spin_lock_irq(&ctx->completion_lock); + spin_unlock_irq(&ctx->completion_lock); io_sq_thread_finish(ctx); io_sqe_buffers_unregister(ctx); -- cgit v1.2.3 From 09a6f4efaa6536e760385f949e24078fd78305ad Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 14 Mar 2021 20:57:10 +0000 Subject: io_uring: replace sqd rw_semaphore with mutex The only user of read-locking of sqd->rw_lock is sq_thread itself, which is by definition alone, so we don't really need rw_semaphore, but mutex will do. Replace it with a mutex, and kill read-to-write upgrading and extra task_work handling in io_sq_thread(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index f8a683607b25..6de779aafd33 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -258,7 +258,7 @@ enum { struct io_sq_data { refcount_t refs; - struct rw_semaphore rw_lock; + struct mutex lock; /* ctx's that are using this sqd */ struct list_head ctx_list; @@ -6689,16 +6689,15 @@ static int io_sq_thread(void *data) set_cpus_allowed_ptr(current, cpu_online_mask); current->flags |= PF_NO_SETAFFINITY; - down_read(&sqd->rw_lock); - + mutex_lock(&sqd->lock); while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) { int ret; bool cap_entries, sqt_spin, needs_sched; if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { - up_read(&sqd->rw_lock); + mutex_unlock(&sqd->lock); cond_resched(); - down_read(&sqd->rw_lock); + mutex_lock(&sqd->lock); io_run_task_work(); timeout = jiffies + sqd->sq_thread_idle; continue; @@ -6745,10 +6744,10 @@ static int io_sq_thread(void *data) list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); - up_read(&sqd->rw_lock); + mutex_unlock(&sqd->lock); schedule(); try_to_freeze(); - down_read(&sqd->rw_lock); + mutex_lock(&sqd->lock); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); } @@ -6756,20 +6755,13 @@ static int io_sq_thread(void *data) finish_wait(&sqd->wait, &wait); timeout = jiffies + sqd->sq_thread_idle; } - up_read(&sqd->rw_lock); - down_write(&sqd->rw_lock); - /* - * someone may have parked and added a cancellation task_work, run - * it first because we don't want it in io_uring_cancel_sqpoll() - */ - io_run_task_work(); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_uring_cancel_sqpoll(ctx); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); - up_write(&sqd->rw_lock); + mutex_unlock(&sqd->lock); io_run_task_work(); complete(&sqd->exited); @@ -7071,21 +7063,21 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) } static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->rw_lock) + __releases(&sqd->lock) { WARN_ON_ONCE(sqd->thread == current); clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - up_write(&sqd->rw_lock); + mutex_unlock(&sqd->lock); } static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->rw_lock) + __acquires(&sqd->lock) { WARN_ON_ONCE(sqd->thread == current); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - down_write(&sqd->rw_lock); + mutex_lock(&sqd->lock); /* set again for consistency, in case concurrent parks are happening */ set_bit(IO_SQ_THREAD_SHOULD_PARK, 
&sqd->state); if (sqd->thread) @@ -7096,11 +7088,11 @@ static void io_sq_thread_stop(struct io_sq_data *sqd) { WARN_ON_ONCE(sqd->thread == current); - down_write(&sqd->rw_lock); + mutex_lock(&sqd->lock); set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); if (sqd->thread) wake_up_process(sqd->thread); - up_write(&sqd->rw_lock); + mutex_unlock(&sqd->lock); wait_for_completion(&sqd->exited); } @@ -7182,7 +7174,7 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, refcount_set(&sqd->refs, 1); INIT_LIST_HEAD(&sqd->ctx_list); - init_rwsem(&sqd->rw_lock); + mutex_init(&sqd->lock); init_waitqueue_head(&sqd->wait); init_completion(&sqd->exited); return sqd; -- cgit v1.2.3 From f6d54255f4235448d4bbe442362d4caa62da97d5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 14 Mar 2021 20:57:11 +0000 Subject: io_uring: halt SQO submission on ctx exit io_sq_thread_finish() is called in io_ring_ctx_free(), so SQPOLL task is potentially running submitting new requests. It's not a disaster because of using a "try" variant of percpu_ref_get, but is far from nice. Remove ctx from the sqd ctx list earlier, before cancellation loop, so SQPOLL can't find it and so won't submit new requests. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6de779aafd33..b87012a21775 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8564,6 +8564,14 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); + /* prevent SQPOLL from submitting new requests */ + if (ctx->sq_data) { + io_sq_thread_park(ctx->sq_data); + list_del_init(&ctx->sqd_list); + io_sqd_update_thread_idle(ctx->sq_data); + io_sq_thread_unpark(ctx->sq_data); + } + io_kill_timeouts(ctx, NULL, NULL); io_poll_remove_all(ctx, NULL, NULL); -- cgit v1.2.3 From 9e138a48345427fa42f6076396ea069cebf3c08f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 14 Mar 2021 20:57:12 +0000 Subject: io_uring: fix concurrent parking If io_sq_thread_park() of one task got rescheduled right after set_bit(), before it gets back to mutex_lock() there can happen park()/unpark() by another task with SQPOLL locking again and continuing running never seeing that first set_bit(SHOULD_PARK), so won't even try to put the mutex down for parking. It will get parked eventually when SQPOLL drops the lock for reschedule, but may be problematic and will get in the way of further fixes. Account number of tasks waiting for parking with a new atomic variable park_pending and adjust SHOULD_PARK accordingly. It doesn't entirely replaces SHOULD_PARK bit with this atomic var because it's convenient to have it as a bit in the state and will help to do optimisations later. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index b87012a21775..70ceb8ed5950 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -258,6 +258,7 @@ enum { struct io_sq_data { refcount_t refs; + atomic_t park_pending; struct mutex lock; /* ctx's that are using this sqd */ @@ -7067,7 +7068,13 @@ static void io_sq_thread_unpark(struct io_sq_data *sqd) { WARN_ON_ONCE(sqd->thread == current); + /* + * Do the dance but not conditional clear_bit() because it'd race with + * other threads incrementing park_pending and setting the bit. 
+ */ clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + if (atomic_dec_return(&sqd->park_pending)) + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); mutex_unlock(&sqd->lock); } @@ -7076,10 +7083,9 @@ static void io_sq_thread_park(struct io_sq_data *sqd) { WARN_ON_ONCE(sqd->thread == current); + atomic_inc(&sqd->park_pending); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); mutex_lock(&sqd->lock); - /* set again for consistency, in case concurrent parks are happening */ - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); if (sqd->thread) wake_up_process(sqd->thread); } @@ -7099,6 +7105,8 @@ static void io_sq_thread_stop(struct io_sq_data *sqd) static void io_put_sq_data(struct io_sq_data *sqd) { if (refcount_dec_and_test(&sqd->refs)) { + WARN_ON_ONCE(atomic_read(&sqd->park_pending)); + io_sq_thread_stop(sqd); kfree(sqd); } @@ -7172,6 +7180,7 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, if (!sqd) return ERR_PTR(-ENOMEM); + atomic_set(&sqd->park_pending, 0); refcount_set(&sqd->refs, 1); INIT_LIST_HEAD(&sqd->ctx_list); mutex_init(&sqd->lock); -- cgit v1.2.3 From 9b46571142e47503ed4f3ae3be5ed3968d8cb9cc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Mar 2021 14:23:07 +0000 Subject: io_uring: add generic callback_head helpers We already have helpers to run/add callback_head but taking ctx and working with ctx->exit_task_work. Extract generic versions of them implemented in terms of struct callback_head, it will be used later. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 62 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 70ceb8ed5950..5c2de43b99f5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1929,17 +1929,44 @@ static int io_req_task_work_add(struct io_kiocb *req) return ret; } -static void io_req_task_work_add_fallback(struct io_kiocb *req, - task_work_func_t cb) +static bool io_run_task_work_head(struct callback_head **work_head) +{ + struct callback_head *work, *next; + bool executed = false; + + do { + work = xchg(work_head, NULL); + if (!work) + break; + + do { + next = work->next; + work->func(work); + work = next; + cond_resched(); + } while (work); + executed = true; + } while (1); + + return executed; +} + +static void io_task_work_add_head(struct callback_head **work_head, + struct callback_head *task_work) { - struct io_ring_ctx *ctx = req->ctx; struct callback_head *head; - init_task_work(&req->task_work, cb); do { - head = READ_ONCE(ctx->exit_task_work); - req->task_work.next = head; - } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head); + head = READ_ONCE(*work_head); + task_work->next = head; + } while (cmpxchg(work_head, head, task_work) != head); +} + +static void io_req_task_work_add_fallback(struct io_kiocb *req, + task_work_func_t cb) +{ + init_task_work(&req->task_work, cb); + io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work); } static void __io_req_task_cancel(struct io_kiocb *req, int error) @@ -8471,26 +8498,9 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) return -EINVAL; } -static bool io_run_ctx_fallback(struct io_ring_ctx *ctx) +static inline bool io_run_ctx_fallback(struct io_ring_ctx *ctx) { - struct callback_head *work, *next; - bool executed = false; - - do { - work = xchg(&ctx->exit_task_work, NULL); - if (!work) - break; - - do { - next = work->next; - work->func(work); - work = next; - 
cond_resched(); - } while (work); - executed = true; - } while (1); - - return executed; + return io_run_task_work_head(&ctx->exit_task_work); } struct io_tctx_exit { -- cgit v1.2.3 From b7f5a0bfe2061b2c7b2164de06fa4072d7373a45 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Mar 2021 14:23:08 +0000 Subject: io_uring: fix sqpoll cancellation via task_work Running sqpoll cancellations via task_work_run() is a bad idea because it depends on other task works being run, but those may be stuck in a currently running task_work_run() because of how it works (it splices the list in batches). Enqueue and run them through a separate callback head, namely struct io_sq_data::park_task_work. As a nice bonus we now precisely control where it's run, which is much safer than guessing where it can happen as before. Reported-by: Jens Axboe Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5c2de43b99f5..5f4e312111ea 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -274,6 +274,7 @@ struct io_sq_data { unsigned long state; struct completion exited; + struct callback_head *park_task_work; }; #define IO_IOPOLL_BATCH 8 @@ -6727,6 +6728,7 @@ static int io_sq_thread(void *data) cond_resched(); mutex_lock(&sqd->lock); io_run_task_work(); + io_run_task_work_head(&sqd->park_task_work); timeout = jiffies + sqd->sq_thread_idle; continue; } @@ -6781,6 +6783,7 @@ static int io_sq_thread(void *data) } finish_wait(&sqd->wait, &wait); + io_run_task_work_head(&sqd->park_task_work); timeout = jiffies + sqd->sq_thread_idle; } @@ -6792,6 +6795,7 @@ static int io_sq_thread(void *data) mutex_unlock(&sqd->lock); io_run_task_work(); + io_run_task_work_head(&sqd->park_task_work); complete(&sqd->exited); do_exit(0); } @@ -8890,7 +8894,7 @@ static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx) if (task) { init_completion(&work.completion); init_task_work(&work.task_work, io_sqpoll_cancel_cb); - WARN_ON_ONCE(task_work_add(task, &work.task_work, TWA_SIGNAL)); + io_task_work_add_head(&sqd->park_task_work, &work.task_work); wake_up_process(task); } io_sq_thread_unpark(sqd); -- cgit v1.2.3 From d336f7ebc65007f5831e2297e6f3383ae8dbf8ed Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 2 Mar 2021 09:32:53 -0800 Subject: xfs: force log and push AIL to clear pinned inodes when aborting mount If we allocate quota inodes in the process of mounting a filesystem but then decide to abort the mount, it's possible that the quota inodes are sitting around pinned by the log. Now that inode reclaim relies on the AIL to flush inodes, we have to force the log and push the AIL in between releasing the quota inodes and kicking off reclaim to tear down all the incore inodes. Do this by extracting the bits we need from the unmount path and reusing them. As an added bonus, failed writes during a failed mount will not retry forever now. This was originally found during a fuzz test of metadata directories (xfs/1546), but the actual symptom was that reclaim hung up on the quota inodes. Signed-off-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/xfs_mount.c | 90 ++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 52370d0a3f43..1c97b155a8ee 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -634,6 +634,47 @@ xfs_check_summary_counts( return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); } +/* + * Flush and reclaim dirty inodes in preparation for unmount. Inodes and + * internal inode structures can be sitting in the CIL and AIL at this point, + * so we need to unpin them, write them back and/or reclaim them before unmount + * can proceed. + * + * An inode cluster that has been freed can have its buffer still pinned in + * memory because the transaction is still sitting in a iclog. The stale inodes + * on that buffer will be pinned to the buffer until the transaction hits the + * disk and the callbacks run. Pushing the AIL will skip the stale inodes and + * may never see the pinned buffer, so nothing will push out the iclog and + * unpin the buffer. + * + * Hence we need to force the log to unpin everything first. However, log + * forces don't wait for the discards they issue to complete, so we have to + * explicitly wait for them to complete here as well. + * + * Then we can tell the world we are unmounting so that error handling knows + * that the filesystem is going away and we should error out anything that we + * have been retrying in the background. This will prevent never-ending + * retries in AIL pushing from hanging the unmount. + * + * Finally, we can push the AIL to clean all the remaining dirty objects, then + * reclaim the remaining inodes that are still in memory at this point in time. + */ +static void +xfs_unmount_flush_inodes( + struct xfs_mount *mp) +{ + xfs_log_force(mp, XFS_LOG_SYNC); + xfs_extent_busy_wait_all(mp); + flush_workqueue(xfs_discard_wq); + + mp->m_flags |= XFS_MOUNT_UNMOUNTING; + + xfs_ail_push_all_sync(mp->m_ail); + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp); + xfs_health_unmount(mp); +} + /* * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct @@ -1008,7 +1049,7 @@ xfs_mountfs( /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); /* - * Cancel all delayed reclaim work and reclaim the inodes directly. + * Flush all inode reclamation work and flush the log. * We have to do this /after/ rtunmount and qm_unmount because those * two will have scheduled delayed reclaim for the rt/quota inodes. * @@ -1018,11 +1059,8 @@ xfs_mountfs( * qm_unmount_quotas and therefore rely on qm_unmount to release the * quota inodes. */ - cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp); - xfs_health_unmount(mp); + xfs_unmount_flush_inodes(mp); out_log_dealloc: - mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) @@ -1063,47 +1101,7 @@ xfs_unmountfs( xfs_rtunmount_inodes(mp); xfs_irele(mp->m_rootip); - /* - * We can potentially deadlock here if we have an inode cluster - * that has been freed has its buffer still pinned in memory because - * the transaction is still sitting in a iclog. The stale inodes - * on that buffer will be pinned to the buffer until the - * transaction hits the disk and the callbacks run. 
Pushing the AIL will - * skip the stale inodes and may never see the pinned buffer, so - * nothing will push out the iclog and unpin the buffer. Hence we - * need to force the log here to ensure all items are flushed into the - * AIL before we go any further. - */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* - * Wait for all busy extents to be freed, including completion of - * any discard operation. - */ - xfs_extent_busy_wait_all(mp); - flush_workqueue(xfs_discard_wq); - - /* - * We now need to tell the world we are unmounting. This will allow - * us to detect that the filesystem is going away and we should error - * out anything that we have been retrying in the background. This will - * prevent neverending retries in AIL pushing from hanging the unmount. - */ - mp->m_flags |= XFS_MOUNT_UNMOUNTING; - - /* - * Flush all pending changes from the AIL. - */ - xfs_ail_push_all_sync(mp->m_ail); - - /* - * Reclaim all inodes. At this point there should be no dirty inodes and - * none should be pinned or locked. Stop background inode reclaim here - * if it is still running. - */ - cancel_delayed_work_sync(&mp->m_reclaim_work); - xfs_reclaim_inodes(mp); - xfs_health_unmount(mp); + xfs_unmount_flush_inodes(mp); xfs_qm_unmount(mp); -- cgit v1.2.3 From 8723d5ba8bdae1c41be7a6fc8469dc9aa551e7d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 14 Mar 2021 10:59:39 -0700 Subject: xfs: also reject BULKSTAT_SINGLE in a mount user namespace BULKSTAT_SINGLE exposed the ondisk uids/gids just like bulkstat, and can be called on any inode, including ones not visible in the current mount. Fixes: f736d93d76d3 ("xfs: support idmapped mounts") Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_itable.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index ca310a125d1e..3498b97fb06d 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -168,6 +168,12 @@ xfs_bulkstat_one( }; int error; + if (breq->mnt_userns != &init_user_ns) { + xfs_warn_ratelimited(breq->mp, + "bulkstat not supported inside of idmapped mounts."); + return -EINVAL; + } + ASSERT(breq->icount == 1); bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), -- cgit v1.2.3 From d2dcc8ed8ec650a793e81d8b2222146eb6ddd84f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 8 Mar 2021 17:20:17 +0800 Subject: btrfs: fix wrong offset to zero out range beyond i_size [BUG] The test generic/091 fails , with the following output: fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -W mapped writes DISABLED Seed set to 1 main: filesystem does not support fallocate mode FALLOC_FL_COLLAPSE_RANGE, disabling! main: filesystem does not support fallocate mode FALLOC_FL_INSERT_RANGE, disabling! skipping zero size read truncating to largest ever: 0xe400 copying to largest ever: 0x1f400 cloning to largest ever: 0x70000 cloning to largest ever: 0x77000 fallocating to largest ever: 0x7a120 Mapped Read: non-zero data past EOF (0x3a7ff) page offset 0x800 is 0xf2e1 <<< ... [CAUSE] In commit c28ea613fafa ("btrfs: subpage: fix the false data csum mismatch error") end_bio_extent_readpage() changes to only zero the range inside the bvec for incoming subpage support. But that commit is using incorrect offset to calculate the start. For subpage, we can have a case that the whole bvec is beyond isize, thus we need to calculate the correct offset. 
But the offending commit is using @end (bvec end) rather than @start (bvec start) to calculate the start offset. This means we only zero the last byte of the bvec, not everything from i_size on. (For example, with a 64K page, i_size at 34K and a bvec covering the 32K-48K range, max() picks offset_in_page(end) == 48K - 1 as zero_start, so only the very last byte of the bvec is cleared instead of the whole 34K-48K tail.) This stupid bug leaves the range beyond i_size improperly zeroed, and fails the above test. [FIX] Use the correct @start to calculate the range start. Reported-by: kernel test robot Fixes: c28ea613fafa ("btrfs: subpage: fix the false data csum mismatch error") Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4671c99d468d..f3d7be975c3a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3020,7 +3020,7 @@ readpage_ok: */ if (page->index == end_index && i_size <= end) { u32 zero_start = max(offset_in_page(i_size), - offset_in_page(end)); + offset_in_page(start)); zero_user_segment(page, zero_start, offset_in_page(end) + 1); -- cgit v1.2.3 From fbf48bb0b197e6894a04c714728c952af7153bf3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 3 Mar 2021 18:41:51 +0800 Subject: btrfs: track qgroup released data in own variable in insert_prealloc_file_extent There is a piece of weird code in insert_prealloc_file_extent(), which looks like: ret = btrfs_qgroup_release_data(inode, file_offset, len); if (ret < 0) return ERR_PTR(ret); if (trans) { ret = insert_reserved_file_extent(trans, inode, file_offset, &stack_fi, true, ret); ... } extent_info.is_new_extent = true; extent_info.qgroup_reserved = ret; ... Note how the variable @ret is abused here: if anyone adds code just after the btrfs_qgroup_release_data() call, it's all too easy to overwrite @ret and cause subtle qgroup-related bugs. Fix such abuse by introducing a new variable, @qgroup_released, so that we won't reuse the existing variable @ret.
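To spell the hazard out, here is a minimal illustration with made-up helpers (release_bytes(), do_other_work() and consume_count() are hypothetical names, not btrfs APIs): a signed int doing double duty as byte count and error code is silently clobbered by the next assignment.

    int ret;

    ret = release_bytes(obj, off, len);  /* >= 0: released byte count, < 0: error */
    if (ret < 0)
            return ret;

    ret = do_other_work(obj);            /* BUG: the released count is gone */
    consume_count(obj, ret);             /* consumes a status code as a count */

Giving the count its own variable, as this patch does, makes such a clobber impossible to introduce by accident.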
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c35b724a5611..77182be403c5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9873,6 +9873,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; + int qgroup_released; int ret; memset(&stack_fi, 0, sizeof(stack_fi)); @@ -9885,14 +9886,14 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ - ret = btrfs_qgroup_release_data(inode, file_offset, len); - if (ret < 0) - return ERR_PTR(ret); + qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len); + if (qgroup_released < 0) + return ERR_PTR(qgroup_released); if (trans) { ret = insert_reserved_file_extent(trans, inode, file_offset, &stack_fi, - true, ret); + true, qgroup_released); if (ret) return ERR_PTR(ret); return trans; @@ -9905,7 +9906,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; extent_info.is_new_extent = true; - extent_info.qgroup_reserved = ret; + extent_info.qgroup_reserved = qgroup_released; extent_info.insertions = 0; path = btrfs_alloc_path(); -- cgit v1.2.3 From a3ee79bd8fe17812d2305ccc4bf81bfeab395576 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 3 Mar 2021 18:41:52 +0800 Subject: btrfs: fix qgroup data rsv leak caused by falloc failure [BUG] When running fsstress with only falloc workload, and a very low qgroup limit set, we can get qgroup data rsv leak at unmount time. BTRFS warning (device dm-0): qgroup 0/5 has unreleased space, type 0 rsv 20480 BTRFS error (device dm-0): qgroup reserved space leaked The minimal reproducer looks like: #!/bin/bash dev=/dev/test/test mnt="/mnt/btrfs" fsstress=~/xfstests-dev/ltp/fsstress runtime=8 workload() { umount $dev &> /dev/null umount $mnt &> /dev/null mkfs.btrfs -f $dev > /dev/null mount $dev $mnt btrfs quota en $mnt btrfs quota rescan -w $mnt btrfs qgroup limit 16m 0/5 $mnt $fsstress -w -z -f creat=10 -f fallocate=10 -p 2 -n 100 \ -d $mnt -v > /tmp/fsstress umount $mnt if dmesg | grep leak ; then echo "!!! FAILED !!!" exit 1 fi } for (( i=0; i < $runtime; i++)); do echo "=== $i/$runtime===" workload done Normally it would fail before round 4. [CAUSE] In function insert_prealloc_file_extent(), we first call btrfs_qgroup_release_data() to know how many bytes are reserved for qgroup data rsv. Then use that @qgroup_released number to continue our work. But after we call btrfs_qgroup_release_data(), we should either queue @qgroup_released to delayed ref or free them manually in error path. Unfortunately, we lack the error handling to free the released bytes, leaking qgroup data rsv. All the error handling function outside won't help at all, as we have released the range, meaning in inode io tree, the EXTENT_QGROUP_RESERVED bit is already cleared, thus all btrfs_qgroup_free_data() call won't free any data rsv. [FIX] Add free_qgroup tag to manually free the released qgroup data rsv. 
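Condensed from the hunks below, the function ends up with the usual release-then-own shape: once the release call succeeds, every error exit funnels through a label that frees what was released (the ellipses stand for the unchanged middle of the function):

    qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
    if (qgroup_released < 0)
            return ERR_PTR(qgroup_released);
    ...
    ret = btrfs_replace_file_extents(..., &trans);
    if (ret)
            goto free_qgroup;
    return trans;

    free_qgroup:
            /* EXTENT_QGROUP_RESERVED is already cleared, so nobody else
             * can see or free these bytes; we must do it ourselves. */
            btrfs_qgroup_free_refroot(inode->root->fs_info,
                                      inode->root->root_key.objectid,
                                      qgroup_released, BTRFS_QGROUP_RSV_DATA);
            return ERR_PTR(ret);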
Reported-by: Nikolay Borisov Reported-by: David Sterba Fixes: 9729f10a608f ("btrfs: inode: move qgroup reserved space release to the callers of insert_reserved_file_extent()") CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 77182be403c5..ea5ede619220 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9895,7 +9895,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( file_offset, &stack_fi, true, qgroup_released); if (ret) - return ERR_PTR(ret); + goto free_qgroup; return trans; } @@ -9910,17 +9910,31 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( extent_info.insertions = 0; path = btrfs_alloc_path(); - if (!path) - return ERR_PTR(-ENOMEM); + if (!path) { + ret = -ENOMEM; + goto free_qgroup; + } ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset, file_offset + len - 1, &extent_info, &trans); btrfs_free_path(path); if (ret) - return ERR_PTR(ret); - + goto free_qgroup; return trans; + +free_qgroup: + /* + * We have released qgroup data range at the beginning of the function, + * and normally qgroup_released bytes will be freed when committing + * transaction. + * But if we error out early, we have to free what we have released + * or we leak qgroup data reservation. + */ + btrfs_qgroup_free_refroot(inode->root->fs_info, + inode->root->root_key.objectid, qgroup_released, + BTRFS_QGROUP_RSV_DATA); + return ERR_PTR(ret); } static int __btrfs_prealloc_file_range(struct inode *inode, int mode, -- cgit v1.2.3 From e3d3b4157610164b0ec43d968b0dfedfe7c68992 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 11 Mar 2021 15:13:30 +0000 Subject: btrfs: zoned: fix linked list corruption after log root tree allocation failure When using a zoned filesystem, while syncing the log, if we fail to allocate the root node for the log root tree, we are not removing the log context we allocated on stack from the list of log contexts of the log root tree. This means after the return from btrfs_sync_log() we get a corrupted linked list. Fix this by allocating the node before adding our stack allocated context to the list of log contexts of the log root tree. 
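The failure mode generalizes to any stack-allocated list node: publish it on a shared list, then take an early-exit path that skips the matching removal, and the list is left pointing into a dead stack frame. A generic sketch, not the btrfs code (fallible_alloc() is a stand-in for btrfs_alloc_log_tree_node()):

    struct ctx {
            struct list_head list;
    };

    int register_and_work(struct list_head *shared)
    {
            struct ctx c;                   /* lives on this stack frame */

            list_add_tail(&c.list, shared); /* published too early */
            if (fallible_alloc() < 0)
                    return -ENOMEM;         /* 'shared' now points into a
                                               soon-to-be-recycled frame */
            /* ... */
            list_del(&c.list);
            return 0;
    }

Doing the fallible allocation first, as in this fix, means the node is only ever published on a path that is guaranteed to reach list_del().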
Fixes: 3ddebf27fcd3a9 ("btrfs: zoned: reorder log node allocation on zoned filesystem") Reviewed-by: Johannes Thumshirn Reviewed-by: Naohiro Aota Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2f1acc9aea9e..92a368627791 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3169,10 +3169,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&log_root_tree->log_mutex); - index2 = log_root_tree->log_transid % 2; - list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); - root_log_ctx.log_transid = log_root_tree->log_transid; - if (btrfs_is_zoned(fs_info)) { if (!log_root_tree->node) { ret = btrfs_alloc_log_tree_node(trans, log_root_tree); @@ -3183,6 +3179,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } } + index2 = log_root_tree->log_transid % 2; + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + root_log_ctx.log_transid = log_root_tree->log_transid; + /* * Now we are safe to update the log_root_tree because we're under the * log_mutex, and we're a current writer so we're holding the commit -- cgit v1.2.3 From 64fcbb6158ecc684d84c64424830a9c37c77c5b9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Mar 2021 10:26:45 +0000 Subject: afs: Fix accessing YFS xattrs on a non-YFS server If someone attempts to access YFS-related xattrs (e.g. afs.yfs.acl) on a file on a non-YFS AFS server (such as OpenAFS), then the kernel will jump to a NULL function pointer because the afs_fetch_acl_operation descriptor doesn't point to a function for issuing an operation on a non-YFS server[1]. Fix this by making afs_wait_for_operation() check that the issue_afs_rpc method is set before jumping to it and setting -ENOTSUPP if not. This fix also covers other potential operations that also only exist on YFS servers. afs_xattr_get/set_yfs() then need to translate -ENOTSUPP to -ENODATA as the former error is internal to the kernel. The bug shows up as an oops like the following: BUG: kernel NULL pointer dereference, address: 0000000000000000 [...] Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6. [...] Call Trace: afs_wait_for_operation+0x83/0x1b0 [kafs] afs_xattr_get_yfs+0xe6/0x270 [kafs] __vfs_getxattr+0x59/0x80 vfs_getxattr+0x11c/0x140 getxattr+0x181/0x250 ? __check_object_size+0x13f/0x150 ? 
__fput+0x16d/0x250 __x64_sys_fgetxattr+0x64/0xb0 do_syscall_64+0x49/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fb120a9defe This was triggered with "cp -a" which attempts to copy xattrs, including afs ones, but is easier to reproduce with getfattr, e.g.: getfattr -d -m ".*" /afs/openafs.org/ Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept") Reported-by: Gaja Sophie Peters Signed-off-by: David Howells Tested-by: Gaja Sophie Peters Reviewed-by: Marc Dionne Reviewed-by: Jeffrey Altman cc: linux-afs@lists.infradead.org Link: http://lists.infradead.org/pipermail/linux-afs/2021-March/003498.html [1] Link: http://lists.infradead.org/pipermail/linux-afs/2021-March/003566.html # v1 Link: http://lists.infradead.org/pipermail/linux-afs/2021-March/003572.html # v2 --- fs/afs/fs_operation.c | 7 +++++-- fs/afs/xattr.c | 8 +++++++- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 97cab12b0a6c..71c58723763d 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -181,10 +181,13 @@ void afs_wait_for_operation(struct afs_operation *op) if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) && op->ops->issue_yfs_rpc) op->ops->issue_yfs_rpc(op); - else + else if (op->ops->issue_afs_rpc) op->ops->issue_afs_rpc(op); + else + op->ac.error = -ENOTSUPP; - op->error = afs_wait_for_call_to_complete(op->call, &op->ac); + if (op->call) + op->error = afs_wait_for_call_to_complete(op->call, &op->ac); } switch (op->error) { diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index c629caae5002..4934e325a14a 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -231,6 +231,8 @@ static int afs_xattr_get_yfs(const struct xattr_handler *handler, else ret = -ERANGE; } + } else if (ret == -ENOTSUPP) { + ret = -ENODATA; } error_yacl: @@ -256,6 +258,7 @@ static int afs_xattr_set_yfs(const struct xattr_handler *handler, { struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(inode); + int ret; if (flags == XATTR_CREATE || strcmp(name, "acl") != 0) @@ -270,7 +273,10 @@ static int afs_xattr_set_yfs(const struct xattr_handler *handler, return afs_put_operation(op); op->ops = &yfs_store_opaque_acl2_operation; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + if (ret == -ENOTSUPP) + ret = -ENODATA; + return ret; } static const struct xattr_handler afs_xattr_yfs_handler = { -- cgit v1.2.3 From a7889c6320b9200e3fe415238f546db677310fa9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 9 Mar 2021 08:27:39 +0000 Subject: afs: Stop listxattr() from listing "afs.*" attributes afs_listxattr() lists all the available special afs xattrs (i.e. those in the "afs.*" space), no matter what type of server we're dealing with. But OpenAFS servers, for example, cannot deal with some of the extra-capable attributes that AuriStor (YFS) servers provide. Unfortunately, the presence of the afs.yfs.* attributes causes errors[1] for anything that tries to read them if the server is of the wrong type. Fix the problem by removing afs_listxattr() so that none of the special xattrs are listed (AFS doesn't support xattrs). It does mean, however, that getfattr won't list them, though they can still be accessed with getxattr() and setxattr(). This can be tested with something like: getfattr -d -m ".*" /afs/example.com/path/to/file With this change, none of the afs.* attributes should be visible. Changes: ver #2: - Hide all of the afs.* xattrs, not just the ACL ones. 
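The resulting split is easy to demonstrate from user space; a sketch using the standard xattr syscalls (the path is illustrative):

    #include <stdio.h>
    #include <sys/xattr.h>

    static void demo(const char *path)
    {
            char buf[128];

            /* Named lookups still reach the afs.* handlers: */
            if (getxattr(path, "afs.fid", buf, sizeof(buf)) >= 0)
                    printf("afs.fid is still retrievable\n");

            /* ...but enumeration no longer reports the afs.* names: */
            ssize_t n = listxattr(path, buf, sizeof(buf));
            printf("%zd bytes of xattr names listed\n", n);
    }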
Fixes: ae46578b963f ("afs: Get YFS ACLs and information through xattrs") Reported-by: Gaja Sophie Peters Signed-off-by: David Howells Tested-by: Gaja Sophie Peters Reviewed-by: Jeffrey Altman Reviewed-by: Marc Dionne cc: linux-afs@lists.infradead.org Link: http://lists.infradead.org/pipermail/linux-afs/2021-March/003502.html [1] Link: http://lists.infradead.org/pipermail/linux-afs/2021-March/003567.html # v1 Link: http://lists.infradead.org/pipermail/linux-afs/2021-March/003573.html # v2 --- fs/afs/dir.c | 1 - fs/afs/file.c | 1 - fs/afs/inode.c | 1 - fs/afs/internal.h | 1 - fs/afs/mntpt.c | 1 - fs/afs/xattr.c | 23 ----------------------- 6 files changed, 28 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 714fcca9af99..17548c1faf02 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -70,7 +70,6 @@ const struct inode_operations afs_dir_inode_operations = { .permission = afs_permission, .getattr = afs_getattr, .setattr = afs_setattr, - .listxattr = afs_listxattr, }; const struct address_space_operations afs_dir_aops = { diff --git a/fs/afs/file.c b/fs/afs/file.c index 85f5adf21aa0..960b64268623 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -43,7 +43,6 @@ const struct inode_operations afs_file_inode_operations = { .getattr = afs_getattr, .setattr = afs_setattr, .permission = afs_permission, - .listxattr = afs_listxattr, }; const struct address_space_operations afs_fs_aops = { diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 1156b2df28d3..12be88716e4c 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -27,7 +27,6 @@ static const struct inode_operations afs_symlink_inode_operations = { .get_link = page_get_link, - .listxattr = afs_listxattr, }; static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index b626e38e9ab5..1627b1872812 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1509,7 +1509,6 @@ extern int afs_launder_page(struct page *); * xattr.c */ extern const struct xattr_handler *afs_xattr_handlers[]; -extern ssize_t afs_listxattr(struct dentry *, char *, size_t); /* * yfsclient.c diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 052dab2f5c03..bbb2c210d139 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -32,7 +32,6 @@ const struct inode_operations afs_mntpt_inode_operations = { .lookup = afs_mntpt_lookup, .readlink = page_readlink, .getattr = afs_getattr, - .listxattr = afs_listxattr, }; const struct inode_operations afs_autocell_inode_operations = { diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 4934e325a14a..7751b0b3f81d 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -11,29 +11,6 @@ #include #include "internal.h" -static const char afs_xattr_list[] = - "afs.acl\0" - "afs.cell\0" - "afs.fid\0" - "afs.volume\0" - "afs.yfs.acl\0" - "afs.yfs.acl_inherited\0" - "afs.yfs.acl_num_cleaned\0" - "afs.yfs.vol_acl"; - -/* - * Retrieve a list of the supported xattrs. - */ -ssize_t afs_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - if (size == 0) - return sizeof(afs_xattr_list); - if (size < sizeof(afs_xattr_list)) - return -ERANGE; - memcpy(buffer, afs_xattr_list, sizeof(afs_xattr_list)); - return sizeof(afs_xattr_list); -} - /* * Deal with the result of a successful fetch ACL operation. 
*/ -- cgit v1.2.3 From 1601ea068b886da1f8f8d4e18b9403e9e24adef6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 15 Mar 2021 12:43:55 +0900 Subject: zonefs: prevent use of seq files as swap file The sequential write constraint of sequential zone files prevents their use as swap files. Only allow conventional zone files to be used as swap files. Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system") Cc: Reviewed-by: Johannes Thumshirn Signed-off-by: Damien Le Moal --- fs/zonefs/super.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 0fe76f376dee..a3d074f98660 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -165,6 +165,21 @@ static int zonefs_writepages(struct address_space *mapping, return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); } +static int zonefs_swap_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) +{ + struct inode *inode = file_inode(swap_file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + if (zi->i_ztype != ZONEFS_ZTYPE_CNV) { + zonefs_err(inode->i_sb, + "swap file: not a conventional zone file\n"); + return -EINVAL; + } + + return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops); +} + static const struct address_space_operations zonefs_file_aops = { .readpage = zonefs_readpage, .readahead = zonefs_readahead, @@ -177,6 +192,7 @@ static const struct address_space_operations zonefs_file_aops = { .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .direct_IO = noop_direct_IO, + .swap_activate = zonefs_swap_activate, }; static void zonefs_update_stats(struct inode *inode, loff_t new_isize) -- cgit v1.2.3 From ebfd68cd0c1e81267c757332385cb96df30dacce Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Mar 2021 15:20:28 +0900 Subject: zonefs: Fix O_APPEND async write handling zonefs updates the size of a sequential zone file inode only on completion of direct writes. When executing asynchronous append writes (to a file opened with O_APPEND or written using RWF_APPEND), using the current inode size in generic_write_checks() to set the iocb offset thus leads to unaligned writes if an application issues an append write while another write is already executing. Fix this problem by introducing zonefs_write_checks() as a modified version of generic_write_checks() that uses the file inode's wp_offset as the iocb offset for append writes. Also introduce zonefs_write_check_limits() to replace the generic_write_check_limits() call. This zonefs-specific helper makes sure that the maximum file limit used is the maximum size of the file being accessed. Since zonefs_write_checks() already truncates the iov_iter, the calls to iov_iter_truncate() in zonefs_file_dio_write() and zonefs_file_buffered_write() are removed. Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system") Cc: Reviewed-by: Johannes Thumshirn Signed-off-by: Damien Le Moal --- fs/zonefs/super.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index a3d074f98660..3427c99abb4d 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -743,6 +743,68 @@ out_release: return ret; } +/* + * Do not exceed the LFS limits nor the file zone size. If pos is under the
+ */ +static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, + loff_t count) +{ + struct inode *inode = file_inode(file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + loff_t limit = rlimit(RLIMIT_FSIZE); + loff_t max_size = zi->i_max_size; + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + count = min(count, limit - pos); + } + + if (!(file->f_flags & O_LARGEFILE)) + max_size = min_t(loff_t, MAX_NON_LFS, max_size); + + if (unlikely(pos >= max_size)) + return -EFBIG; + + return min(count, max_size - pos); +} + +static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + loff_t count; + + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + + if (!iov_iter_count(from)) + return 0; + + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + + if (iocb->ki_flags & IOCB_APPEND) { + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return -EINVAL; + mutex_lock(&zi->i_truncate_mutex); + iocb->ki_pos = zi->i_wpoffset; + mutex_unlock(&zi->i_truncate_mutex); + } + + count = zonefs_write_check_limits(file, iocb->ki_pos, + iov_iter_count(from)); + if (count < 0) + return count; + + iov_iter_truncate(from, count); + return iov_iter_count(from); +} + /* * Handle direct writes. For sequential zone files, this is the only possible * write path. For these files, check that the user is issuing writes @@ -760,8 +822,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) struct super_block *sb = inode->i_sb; bool sync = is_sync_kiocb(iocb); bool append = false; - size_t count; - ssize_t ret; + ssize_t ret, count; /* * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT @@ -779,12 +840,11 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); } - ret = generic_write_checks(iocb, from); - if (ret <= 0) + count = zonefs_write_checks(iocb, from); + if (count <= 0) { + ret = count; goto inode_unlock; - - iov_iter_truncate(from, zi->i_max_size - iocb->ki_pos); - count = iov_iter_count(from); + } if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { ret = -EINVAL; @@ -844,12 +904,10 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, inode_lock(inode); } - ret = generic_write_checks(iocb, from); + ret = zonefs_write_checks(iocb, from); if (ret <= 0) goto inode_unlock; - iov_iter_truncate(from, zi->i_max_size - iocb->ki_pos); - ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops); if (ret > 0) iocb->ki_pos += ret; -- cgit v1.2.3 From d9bb77d51e668a1a6d4530c1ea471574d0ce465f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 15 Mar 2021 13:39:14 +0800 Subject: btrfs: subpage: fix wild pointer access during metadata read failure [BUG] When running fstests for btrfs subpage read-write test, it has a very high chance to crash at generic/475 with the following stack: BTRFS warning (device dm-8): direct IO failed ino 510 rw 1,34817 sector 0xcdf0 len 94208 err no 10 Unable to handle kernel paging request at virtual address ffff80001157e7c0 CPU: 2 PID: 687125 Comm: kworker/u12:4 Tainted: G WC 5.12.0-rc2-custom+ #5 Hardware name: Khadas VIM3 (DT) Workqueue: btrfs-endio-meta btrfs_work_helper [btrfs] pc : queued_spin_lock_slowpath+0x1a0/0x390 lr : do_raw_spin_lock+0xc4/0x11c Call trace: queued_spin_lock_slowpath+0x1a0/0x390 _raw_spin_lock+0x68/0x84 
btree_readahead_hook+0x38/0xc0 [btrfs] end_bio_extent_readpage+0x504/0x5f4 [btrfs] bio_endio+0x170/0x1a4 end_workqueue_fn+0x3c/0x60 [btrfs] btrfs_work_helper+0x1b0/0x1b4 [btrfs] process_one_work+0x22c/0x430 worker_thread+0x70/0x3a0 kthread+0x13c/0x140 ret_from_fork+0x10/0x30 Code: 910020e0 8b0200c2 f861d884 aa0203e1 (f8246827) [CAUSE] In end_bio_extent_readpage(), if we hit an error during read, we will handle the error differently for data and metadata. For data we queue a repair, while for metadata, we record the error and let the caller choose what to do. But the code is still using page->private to grab extent buffer, which no longer points to extent buffer for subpage metadata pages. Thus this wild pointer access leads to above crash. [FIX] Introduce a helper, find_extent_buffer_readpage(), to grab extent buffer. The difference against find_extent_buffer_nospinlock() is: - Also handles regular sectorsize == PAGE_SIZE case - No extent buffer refs increase/decrease As extent buffer under IO must have non-zero refs, so this is safe Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f3d7be975c3a..0d00a9cd123a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2885,6 +2885,35 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } +/* + * Find extent buffer for a givne bytenr. + * + * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking + * in endio context. + */ +static struct extent_buffer *find_extent_buffer_readpage( + struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) +{ + struct extent_buffer *eb; + + /* + * For regular sectorsize, we can use page->private to grab extent + * buffer + */ + if (fs_info->sectorsize == PAGE_SIZE) { + ASSERT(PagePrivate(page) && page->private); + return (struct extent_buffer *)page->private; + } + + /* For subpage case, we need to lookup buffer radix tree */ + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + bytenr >> fs_info->sectorsize_bits); + rcu_read_unlock(); + ASSERT(eb); + return eb; +} + /* * after a readpage IO is done, we need to: * clear the uptodate bits on error @@ -2996,7 +3025,7 @@ static void end_bio_extent_readpage(struct bio *bio) } else { struct extent_buffer *eb; - eb = (struct extent_buffer *)page->private; + eb = find_extent_buffer_readpage(fs_info, page, start); set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = mirror; atomic_dec(&eb->io_pages); -- cgit v1.2.3 From 60484cd9d50117017cf53d5310c6cd629600dc69 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 15 Mar 2021 13:39:15 +0800 Subject: btrfs: subpage: make readahead work properly In readahead infrastructure, we are using a lot of hard coded PAGE_SHIFT while we're not doing anything specific to PAGE_SIZE. One of the most affected part is the radix tree operation of btrfs_fs_info::reada_tree. If using PAGE_SHIFT, subpage metadata readahead is broken and does no help reading metadata ahead. Fix the problem by using btrfs_fs_info::sectorsize_bits so that readahead could work for subpage. 
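The index arithmetic shows why keying on PAGE_SHIFT breaks the subpage case. Assuming 64K pages (PAGE_SHIFT == 16) and a 4K sector size (sectorsize_bits == 12), two neighbouring tree blocks collide on a single radix tree slot:

    u64 eb1 = 0x10000, eb2 = 0x11000;       /* adjacent 4K tree blocks */

    unsigned long p1 = eb1 >> PAGE_SHIFT;   /* 1 */
    unsigned long p2 = eb2 >> PAGE_SHIFT;   /* 1: same key, entries collide */

    unsigned long s1 = eb1 >> 12;           /* 16 */
    unsigned long s2 = eb2 >> 12;           /* 17: each block gets its own slot */

Every radix tree keyed this way (reada_tree, reada_zones, reada_extents) needs the same shift on both the insert and lookup sides for entries to be found again.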
Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/reada.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 20fd4aa48a8c..06713a8fe26b 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -209,7 +209,7 @@ int btree_readahead_hook(struct extent_buffer *eb, int err) /* find extent */ spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, - eb->start >> PAGE_SHIFT); + eb->start >> fs_info->sectorsize_bits); if (re) re->refcnt++; spin_unlock(&fs_info->reada_lock); @@ -240,7 +240,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, zone = NULL; spin_lock(&fs_info->reada_lock); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> PAGE_SHIFT, 1); + logical >> fs_info->sectorsize_bits, 1); if (ret == 1 && logical >= zone->start && logical <= zone->end) { kref_get(&zone->refcnt); spin_unlock(&fs_info->reada_lock); @@ -283,13 +283,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&dev->reada_zones, - (unsigned long)(zone->end >> PAGE_SHIFT), - zone); + (unsigned long)(zone->end >> fs_info->sectorsize_bits), + zone); if (ret == -EEXIST) { kfree(zone); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> PAGE_SHIFT, 1); + logical >> fs_info->sectorsize_bits, 1); if (ret == 1 && logical >= zone->start && logical <= zone->end) kref_get(&zone->refcnt); else @@ -315,7 +315,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, u64 length; int real_stripes; int nzones = 0; - unsigned long index = logical >> PAGE_SHIFT; + unsigned long index = logical >> fs_info->sectorsize_bits; int dev_replace_is_ongoing; int have_zone = 0; @@ -497,7 +497,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, struct reada_extent *re) { int i; - unsigned long index = re->logical >> PAGE_SHIFT; + unsigned long index = re->logical >> fs_info->sectorsize_bits; spin_lock(&fs_info->reada_lock); if (--re->refcnt) { @@ -538,11 +538,12 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, static void reada_zone_release(struct kref *kref) { struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); + struct btrfs_fs_info *fs_info = zone->device->fs_info; - lockdep_assert_held(&zone->device->fs_info->reada_lock); + lockdep_assert_held(&fs_info->reada_lock); radix_tree_delete(&zone->device->reada_zones, - zone->end >> PAGE_SHIFT); + zone->end >> fs_info->sectorsize_bits); kfree(zone); } @@ -593,7 +594,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical, static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) { int i; - unsigned long index = zone->end >> PAGE_SHIFT; + unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits; for (i = 0; i < zone->ndevs; ++i) { struct reada_zone *peer; @@ -628,7 +629,7 @@ static int reada_pick_zone(struct btrfs_device *dev) (void **)&zone, index, 1); if (ret == 0) break; - index = (zone->end >> PAGE_SHIFT) + 1; + index = (zone->end >> dev->fs_info->sectorsize_bits) + 1; if (zone->locked) { if (zone->elems > top_locked_elems) { top_locked_elems = zone->elems; @@ -709,7 +710,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev) * plugging to speed things up */ ret = 
radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
-			dev->reada_next >> PAGE_SHIFT, 1);
+			dev->reada_next >> fs_info->sectorsize_bits, 1);
 	if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
 		ret = reada_pick_zone(dev);
 		if (!ret) {
@@ -718,7 +719,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
 		}
 		re = NULL;
 		ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
-					dev->reada_next >> PAGE_SHIFT, 1);
+					dev->reada_next >> fs_info->sectorsize_bits, 1);
 	}
 	if (ret == 0) {
 		spin_unlock(&fs_info->reada_lock);
@@ -885,7 +886,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 			pr_cont(" curr off %llu", device->reada_next - zone->start);
 		pr_cont("\n");
-		index = (zone->end >> PAGE_SHIFT) + 1;
+		index = (zone->end >> fs_info->sectorsize_bits) + 1;
 	}
 	cnt = 0;
 	index = 0;
@@ -910,7 +911,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 			}
 		}
 		pr_cont("\n");
-		index = (re->logical >> PAGE_SHIFT) + 1;
+		index = (re->logical >> fs_info->sectorsize_bits) + 1;
 		if (++cnt > 15)
 			break;
 	}
@@ -926,7 +927,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 		if (ret == 0)
 			break;
 		if (!re->scheduled) {
-			index = (re->logical >> PAGE_SHIFT) + 1;
+			index = (re->logical >> fs_info->sectorsize_bits) + 1;
 			continue;
 		}
 		pr_debug("re: logical %llu size %u list empty %d scheduled %d",
@@ -942,7 +943,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 			}
 		}
 		pr_cont("\n");
-		index = (re->logical >> PAGE_SHIFT) + 1;
+		index = (re->logical >> fs_info->sectorsize_bits) + 1;
 	}
 	spin_unlock(&fs_info->reada_lock);
 }
--
cgit v1.2.3

From f8425c9396639cc462bcce44b1051f8b4e62fddb Mon Sep 17 00:00:00 2001
From: Alessio Balsini
Date: Mon, 25 Jan 2021 15:30:51 +0000
Subject: fuse: 32-bit user space ioctl compat for fuse device

With a 64-bit kernel build the FUSE device cannot handle ioctl requests
coming from 32-bit user space. This is due to the ioctl command
translation generating different command identifiers, which therefore
cannot be used for direct comparison without proper manipulation.

Explicitly extract type and number from the ioctl command to enable
32-bit user space compatibility on 64-bit kernel builds.

Signed-off-by: Alessio Balsini
Signed-off-by: Miklos Szeredi
---
 fs/fuse/dev.c             | 26 ++++++++++++++++----------
 include/uapi/linux/fuse.h |  3 ++-
 2 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c6636b4c4ccf..c0fee830a34e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2229,19 +2229,21 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
 static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 			   unsigned long arg)
 {
-	int err = -ENOTTY;
+	int res;
+	int oldfd;
+	struct fuse_dev *fud = NULL;
 
-	if (cmd == FUSE_DEV_IOC_CLONE) {
-		int oldfd;
+	if (_IOC_TYPE(cmd) != FUSE_DEV_IOC_MAGIC)
+		return -ENOTTY;
 
-		err = -EFAULT;
-		if (!get_user(oldfd, (__u32 __user *) arg)) {
+	switch (_IOC_NR(cmd)) {
+	case _IOC_NR(FUSE_DEV_IOC_CLONE):
+		res = -EFAULT;
+		if (!get_user(oldfd, (__u32 __user *)arg)) {
 			struct file *old = fget(oldfd);
 
-			err = -EINVAL;
+			res = -EINVAL;
 			if (old) {
-				struct fuse_dev *fud = NULL;
-
 				/*
 				 * Check against file->f_op because CUSE
 				 * uses the same ioctl handler.
@@ -2252,14 +2254,18 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
 				if (fud) {
 					mutex_lock(&fuse_mutex);
-					err = fuse_device_clone(fud->fc, file);
+					res = fuse_device_clone(fud->fc, file);
 					mutex_unlock(&fuse_mutex);
 				}
 				fput(old);
 			}
 		}
+		break;
+	default:
+		res = -ENOTTY;
+		break;
 	}
-	return err;
+	return res;
 }
 
 const struct file_operations fuse_dev_operations = {
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 98ca64d1beb6..54442612c48b 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -903,7 +903,8 @@ struct fuse_notify_retrieve_in {
 };
 
 /* Device ioctls: */
-#define FUSE_DEV_IOC_CLONE	_IOR(229, 0, uint32_t)
+#define FUSE_DEV_IOC_MAGIC	229
+#define FUSE_DEV_IOC_CLONE	_IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t)
 
 struct fuse_lseek_in {
 	uint64_t	fh;
--
cgit v1.2.3

From 34e49994d0dcdb2d31d4d2908d04f4e9ce57e4d7 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Mon, 15 Mar 2021 15:18:24 +0100
Subject: btrfs: fix slab cache flags for free space tree bitmap

The free space tree bitmap slab cache is created with SLAB_RED_ZONE but
that's a debugging flag and not always enabled. Also the other slabs are
created with at least SLAB_MEM_SPREAD, which we want as well to average
the memory placement cost.

Reported-by: Vlastimil Babka
Fixes: 3acd48507dc4 ("btrfs: fix allocation of free space cache v1 bitmap pages")
CC: stable@vger.kernel.org # 5.4+
Signed-off-by: David Sterba
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ea5ede619220..5092dea21448 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9005,7 +9005,7 @@ int __init btrfs_init_cachep(void)
 	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
 							PAGE_SIZE, PAGE_SIZE,
-							SLAB_RED_ZONE, NULL);
+							SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_free_space_bitmap_cachep)
 		goto fail;
 
--
cgit v1.2.3

From dbcc7d57bffc0c8cac9dac11bec548597d59a6a5 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Thu, 11 Mar 2021 14:31:05 +0000
Subject: btrfs: fix race when cloning extent buffer during rewind of an old root

While resolving backreferences, as part of a logical ino ioctl call or
fiemap, we can end up hitting a BUG_ON() when replaying tree mod log
operations of a root, triggering a stack trace like the following:

 ------------[ cut here ]------------
 kernel BUG at fs/btrfs/ctree.c:1210!
 invalid opcode: 0000 [#1] SMP KASAN PTI
 CPU: 1 PID: 19054 Comm: crawl_335 Tainted: G        W 5.11.0-2d11c0084b02-misc-next+ #89
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
 RIP: 0010:__tree_mod_log_rewind+0x3b1/0x3c0
 Code: 05 48 8d 74 10 (...)
 RSP: 0018:ffffc90001eb70b8 EFLAGS: 00010297
 RAX: 0000000000000000 RBX: ffff88812344e400 RCX: ffffffffb28933b6
 RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff88812344e42c
 RBP: ffffc90001eb7108 R08: 1ffff11020b60a20 R09: ffffed1020b60a20
 R10: ffff888105b050f9 R11: ffffed1020b60a1f R12: 00000000000000ee
 R13: ffff8880195520c0 R14: ffff8881bc958500 R15: ffff88812344e42c
 FS:  00007fd1955e8700(0000) GS:ffff8881f5600000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007efdb7928718 CR3: 000000010103a006 CR4: 0000000000170ee0
 Call Trace:
  btrfs_search_old_slot+0x265/0x10d0
  ? lock_acquired+0xbb/0x600
  ? btrfs_search_slot+0x1090/0x1090
  ? free_extent_buffer.part.61+0xd7/0x140
  ? free_extent_buffer+0x13/0x20
  resolve_indirect_refs+0x3e9/0xfc0
  ? lock_downgrade+0x3d0/0x3d0
  ? __kasan_check_read+0x11/0x20
  ? add_prelim_ref.part.11+0x150/0x150
  ? lock_downgrade+0x3d0/0x3d0
  ? __kasan_check_read+0x11/0x20
  ? lock_acquired+0xbb/0x600
  ? __kasan_check_write+0x14/0x20
  ? do_raw_spin_unlock+0xa8/0x140
  ? rb_insert_color+0x30/0x360
  ? prelim_ref_insert+0x12d/0x430
  find_parent_nodes+0x5c3/0x1830
  ? resolve_indirect_refs+0xfc0/0xfc0
  ? lock_release+0xc8/0x620
  ? fs_reclaim_acquire+0x67/0xf0
  ? lock_acquire+0xc7/0x510
  ? lock_downgrade+0x3d0/0x3d0
  ? lockdep_hardirqs_on_prepare+0x160/0x210
  ? lock_release+0xc8/0x620
  ? fs_reclaim_acquire+0x67/0xf0
  ? lock_acquire+0xc7/0x510
  ? poison_range+0x38/0x40
  ? unpoison_range+0x14/0x40
  ? trace_hardirqs_on+0x55/0x120
  btrfs_find_all_roots_safe+0x142/0x1e0
  ? find_parent_nodes+0x1830/0x1830
  ? btrfs_inode_flags_to_xflags+0x50/0x50
  iterate_extent_inodes+0x20e/0x580
  ? tree_backref_for_extent+0x230/0x230
  ? lock_downgrade+0x3d0/0x3d0
  ? read_extent_buffer+0xdd/0x110
  ? lock_downgrade+0x3d0/0x3d0
  ? __kasan_check_read+0x11/0x20
  ? lock_acquired+0xbb/0x600
  ? __kasan_check_write+0x14/0x20
  ? _raw_spin_unlock+0x22/0x30
  ? __kasan_check_write+0x14/0x20
  iterate_inodes_from_logical+0x129/0x170
  ? iterate_inodes_from_logical+0x129/0x170
  ? btrfs_inode_flags_to_xflags+0x50/0x50
  ? iterate_extent_inodes+0x580/0x580
  ? __vmalloc_node+0x92/0xb0
  ? init_data_container+0x34/0xb0
  ? init_data_container+0x34/0xb0
  ? kvmalloc_node+0x60/0x80
  btrfs_ioctl_logical_to_ino+0x158/0x230
  btrfs_ioctl+0x205e/0x4040
  ? __might_sleep+0x71/0xe0
  ? btrfs_ioctl_get_supported_features+0x30/0x30
  ? getrusage+0x4b6/0x9c0
  ? __kasan_check_read+0x11/0x20
  ? lock_release+0xc8/0x620
  ? __might_fault+0x64/0xd0
  ? lock_acquire+0xc7/0x510
  ? lock_downgrade+0x3d0/0x3d0
  ? lockdep_hardirqs_on_prepare+0x210/0x210
  ? lockdep_hardirqs_on_prepare+0x210/0x210
  ? __kasan_check_read+0x11/0x20
  ? do_vfs_ioctl+0xfc/0x9d0
  ? ioctl_file_clone+0xe0/0xe0
  ? lock_downgrade+0x3d0/0x3d0
  ? lockdep_hardirqs_on_prepare+0x210/0x210
  ? __kasan_check_read+0x11/0x20
  ? lock_release+0xc8/0x620
  ? __task_pid_nr_ns+0xd3/0x250
  ? lock_acquire+0xc7/0x510
  ? __fget_files+0x160/0x230
  ? __fget_light+0xf2/0x110
  __x64_sys_ioctl+0xc3/0x100
  do_syscall_64+0x37/0x80
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 RIP: 0033:0x7fd1976e2427
 Code: 00 00 90 48 8b 05 (...)
 RSP: 002b:00007fd1955e5cf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
 RAX: ffffffffffffffda RBX: 00007fd1955e5f40 RCX: 00007fd1976e2427
 RDX: 00007fd1955e5f48 RSI: 00000000c038943b RDI: 0000000000000004
 RBP: 0000000001000000 R08: 0000000000000000 R09: 00007fd1955e6120
 R10: 0000557835366b00 R11: 0000000000000246 R12: 0000000000000004
 R13: 00007fd1955e5f48 R14: 00007fd1955e5f40 R15: 00007fd1955e5ef8
 Modules linked in:
 ---[ end trace ec8931a1c36e57be ]---

 (gdb) l *(__tree_mod_log_rewind+0x3b1)
 0xffffffff81893521 is in __tree_mod_log_rewind (fs/btrfs/ctree.c:1210).
 1205			 * the modification. as we're going backwards, we do the
 1206			 * opposite of each operation here.
 1207			 */
 1208			switch (tm->op) {
 1209			case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
 1210				BUG_ON(tm->slot < n);
 1211				fallthrough;
 1212			case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
 1213			case MOD_LOG_KEY_REMOVE:
 1214				btrfs_set_node_key(eb, &tm->key, tm->slot);

Here's what happens to hit that BUG_ON():

1) We have one tree mod log user (through fiemap or the logical ino ioctl),
   with a sequence number of 1, so we have fs_info->tree_mod_seq == 1;

2) Another task is at ctree.c:balance_level() and we have eb X currently as
   the root of the tree, and we promote its single child, eb Y, as the new
   root.

   Then, at ctree.c:balance_level(), we call:

   tree_mod_log_insert_root(eb X, eb Y, 1);

3) At tree_mod_log_insert_root() we create tree mod log elements for each
   slot of eb X, of operation type MOD_LOG_KEY_REMOVE_WHILE_FREEING each
   with a ->logical pointing to ebX->start. These are placed in an array
   named tm_list.
   Let's assume there are N elements (N pointers in eb X);

4) Then, still at tree_mod_log_insert_root(), we create a tree mod log
   element of operation type MOD_LOG_ROOT_REPLACE, ->logical set to
   ebY->start, ->old_root.logical set to ebX->start, ->old_root.level set
   to the level of eb X and ->generation set to the generation of eb X;

5) Then tree_mod_log_insert_root() calls tree_mod_log_free_eb() with
   tm_list as argument. After that, tree_mod_log_free_eb() calls
   __tree_mod_log_insert() for each member of tm_list in reverse order,
   from highest slot in eb X, slot N - 1, to slot 0 of eb X;

6) __tree_mod_log_insert() sets the sequence number of each given tree mod
   log operation - it increments fs_info->tree_mod_seq and sets
   fs_info->tree_mod_seq as the sequence number of the given tree mod log
   operation.

   This means that for the tm_list created at tree_mod_log_insert_root(),
   the element corresponding to slot 0 of eb X has the highest sequence
   number (1 + N), and the element corresponding to the last slot has the
   lowest sequence number (2);

7) Then, after inserting tm_list's elements into the tree mod log rbtree,
   the MOD_LOG_ROOT_REPLACE element is inserted, which gets the highest
   sequence number, which is N + 2;

8) Back to ctree.c:balance_level(), we free eb X by calling
   btrfs_free_tree_block() on it. Because eb X was created in the current
   transaction, has no other references and writeback did not happen for
   it, we add it back to the free space cache/tree;

9) Later some other task T allocates the metadata extent from eb X, since
   it is marked as free space in the space cache/tree, and uses it as a
   node for some other btree;

10) The tree mod log user task calls btrfs_search_old_slot(), which calls
    get_old_root(), and finally that calls __tree_mod_log_oldest_root()
    with time_seq == 1 and eb_root == eb Y;

11) First iteration of the while loop finds the tree mod log element with
    sequence number N + 2, for the logical address of eb Y and of type
    MOD_LOG_ROOT_REPLACE;

12) Because the operation type is MOD_LOG_ROOT_REPLACE, we don't break out
    of the loop, and set root_logical to point to tm->old_root.logical
    which corresponds to the logical address of eb X;

13) On the next iteration of the while loop, the call to
    tree_mod_log_search_oldest() returns the smallest tree mod log element
    for the logical address of eb X, which has a sequence number of 2, an
    operation type of MOD_LOG_KEY_REMOVE_WHILE_FREEING and corresponds to
    the old slot N - 1 of eb X (eb X had N items in it before being freed);

14) We then break out of the while loop and return the tree mod log
    operation of type MOD_LOG_ROOT_REPLACE (eb Y), and not the one for
    slot N - 1 of eb X, to get_old_root();

15) At get_old_root(), we process the MOD_LOG_ROOT_REPLACE operation and
    set "logical" to the logical address of eb X, which was the old root.
    We then call tree_mod_log_search() passing it the logical address of
    eb X and time_seq == 1;

16) Then before calling tree_mod_log_search(), task T adds a key to eb X,
    which results in adding a tree mod log operation of type
    MOD_LOG_KEY_ADD to the tree mod log - this is done at
    ctree.c:insert_ptr() - but after adding the tree mod log operation and
    before updating the number of items in eb X from 0 to 1...

17) The task at get_old_root() calls tree_mod_log_search() and gets the
    tree mod log operation of type MOD_LOG_KEY_ADD just added by task T.
    Then it enters the following if branch:

    if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
       (...)
    }
    (...)

    Calls read_tree_block() for eb X, which gets a reference on eb X but
    does not lock it - task T has it locked.
    Then it clones eb X while it has nritems set to 0 in its header,
    before task T sets nritems to 1 in eb X's header. From here on we use
    the clone of eb X which no other task has access to;

18) Then we call __tree_mod_log_rewind(), passing it the MOD_LOG_KEY_ADD
    mod log operation we just got from tree_mod_log_search() in the
    previous step and the cloned version of eb X;

19) At __tree_mod_log_rewind(), we set the local variable "n" to the
    number of items set in eb X's clone, which is 0. Then we enter the
    while loop, and in its first iteration we process the MOD_LOG_KEY_ADD
    operation, which just decrements "n" from 0 to (u32)-1, since "n" is
    declared with a type of u32. At the end of this iteration we call
    rb_next() to find the next tree mod log operation for eb X, that gives
    us the mod log operation of type MOD_LOG_KEY_REMOVE_WHILE_FREEING, for
    slot 0, with a sequence number of N + 1 (steps 3 to 6);

20) Then we go back to the top of the while loop and trigger the following
    BUG_ON():

    (...)
    switch (tm->op) {
    case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
            BUG_ON(tm->slot < n);
            fallthrough;
    (...)

    Because "n" has a value of (u32)-1 (4294967295) and tm->slot is 0.

Fix this by taking a read lock on the extent buffer before cloning it at
ctree.c:get_old_root(). This should be done regardless of the extent
buffer having been freed and reused, as a concurrent task might be
modifying it (while holding a write lock on it).

Reported-by: Zygo Blaxell
Link: https://lore.kernel.org/linux-btrfs/20210227155037.GN28049@hungrycats.org/
Fixes: 834328a8493079 ("Btrfs: tree mod log's old roots could still be part of the tree")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/ctree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d56730a67885..34b929bd5c1a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1365,7 +1365,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 				   "failed to read tree block %llu from get_old_root",
 				   logical);
 		} else {
+			btrfs_tree_read_lock(old);
 			eb = btrfs_clone_extent_buffer(old);
+			btrfs_tree_read_unlock(old);
 			free_extent_buffer(old);
 		}
 	} else if (old_root) {
--
cgit v1.2.3

From 485df75554257e883d0ce39bb886e8212349748e Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Thu, 11 Mar 2021 14:31:06 +0000
Subject: btrfs: always pin deleted leaves when there are active tree mod log users

When freeing a tree block we may end up adding its extent back to the
free space cache/tree, as long as there are no more references for it,
it was created in the current transaction and writeback for it never
happened. This is generally fine, however when we have tree mod log
operations it can result in inconsistent versions of a btree after
unwinding extent buffers with the recorded tree mod log operations.

This is because:

* We only log operations for nodes (adding and removing key/pointers),
  for leaves we don't do anything;

* This means that we can log a MOD_LOG_KEY_REMOVE_WHILE_FREEING operation
  for a node that points to a leaf that was deleted;

* Before we apply the logged operation to unwind a node, we can have that
  leaf's extent allocated again, either as a node or as a leaf, and
  possibly for another btree. This is possible if the leaf was created in
  the current transaction and writeback for it never started, in which
  case btrfs_free_tree_block() returns its extent back to the free space
  cache/tree;

* Then, before applying the tree mod log operation, some task allocates
  the metadata extent just freed before, and uses it either as a leaf or
  as a node for some btree (can be the same or another one, it does not
  matter);

* After applying the MOD_LOG_KEY_REMOVE_WHILE_FREEING operation we now
  get the target node with an item pointing to the metadata extent that
  now has content different from what it had before the leaf was deleted.
  It might now belong to a different btree and be a node and not a leaf
  anymore.

  As a consequence, the results of searches after the unwinding can be
  unpredictable and produce unexpected results.

So make sure we pin extent buffers corresponding to leaves when there
are tree mod log users.

CC: stable@vger.kernel.org # 4.14+
Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/extent-tree.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 78ad31a59e59..36a3c973fda1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3323,6 +3323,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 
 	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
 		struct btrfs_block_group *cache;
+		bool must_pin = false;
 
 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 			ret = check_ref_cleanup(trans, buf->start);
@@ -3340,7 +3341,27 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 			goto out;
 		}
 
-		if (btrfs_is_zoned(fs_info)) {
+		/*
+		 * If this is a leaf and there are tree mod log users, we may
+		 * have recorded mod log operations that point to this leaf.
+		 * So we must make sure no one reuses this leaf's extent before
+		 * mod log operations are applied to a node, otherwise after
+		 * rewinding a node using the mod log operations we get an
+		 * inconsistent btree, as the leaf's extent may now be used as
+		 * a node or leaf for another different btree.
+		 * We are safe from races here because at this point no other
+		 * node or root points to this extent buffer, so if after this
+		 * check a new tree mod log user joins, it will not be able to
+		 * find a node pointing to this leaf and record operations that
+		 * point to this leaf.
+		 */
+		if (btrfs_header_level(buf) == 0) {
+			read_lock(&fs_info->tree_mod_log_lock);
+			must_pin = !list_empty(&fs_info->tree_mod_seq_list);
+			read_unlock(&fs_info->tree_mod_log_lock);
+		}
+
+		if (must_pin || btrfs_is_zoned(fs_info)) {
 			btrfs_redirty_list_add(trans->transaction, buf);
 			pin_down_extent(trans, cache, buf->start, buf->len, 1);
 			btrfs_put_block_group(cache);
--
cgit v1.2.3

From 5abbe51a526253b9f003e9a0a195638dc882d660 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Mon, 1 Feb 2021 18:46:41 +0100
Subject: kernel, fs: Introduce and use set_restart_fn() and arch_set_restart_data()

Preparation for fixing get_nr_restart_syscall() on X86 for COMPAT.

Add a new helper which sets restart_block->fn and calls a dummy
arch_set_restart_data() helper.

Fixes: 609c19a385c8 ("x86/ptrace: Stop setting TS_COMPAT in ptrace code")
Signed-off-by: Oleg Nesterov
Signed-off-by: Thomas Gleixner
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20210201174641.GA17871@redhat.com
---
 fs/select.c                    | 10 ++++------
 include/linux/thread_info.h    | 13 +++++++++++++
 kernel/futex.c                 |  3 +--
 kernel/time/alarmtimer.c       |  2 +-
 kernel/time/hrtimer.c          |  2 +-
 kernel/time/posix-cpu-timers.c |  2 +-
 6 files changed, 21 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index 37aaa8317f3a..945896d0ac9e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -1055,10 +1055,9 @@ static long do_restart_poll(struct restart_block *restart_block)
 
 	ret = do_sys_poll(ufds, nfds, to);
 
-	if (ret == -ERESTARTNOHAND) {
-		restart_block->fn = do_restart_poll;
-		ret = -ERESTART_RESTARTBLOCK;
-	}
+	if (ret == -ERESTARTNOHAND)
+		ret = set_restart_fn(restart_block, do_restart_poll);
+
 	return ret;
 }
 
@@ -1080,7 +1079,6 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
 		struct restart_block *restart_block;
 
 		restart_block = &current->restart_block;
-		restart_block->fn = do_restart_poll;
 		restart_block->poll.ufds = ufds;
 		restart_block->poll.nfds = nfds;
 
@@ -1091,7 +1089,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
 		} else
 			restart_block->poll.has_timeout = 0;
 
-		ret = -ERESTART_RESTARTBLOCK;
+		ret = set_restart_fn(restart_block, do_restart_poll);
 	}
 	return ret;
 }
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 9b2158c69275..157762db9d4b 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 #include <linux/bug.h>
 #include <linux/restart_block.h>
+#include <linux/errno.h>
 
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 /*
@@ -59,6 +60,18 @@ enum syscall_work_bit {
 
 #ifdef __KERNEL__
 
+#ifndef arch_set_restart_data
+#define arch_set_restart_data(restart) do { } while (0)
+#endif
+
+static inline long set_restart_fn(struct restart_block *restart,
+					long (*fn)(struct restart_block *))
+{
+	restart->fn = fn;
+	arch_set_restart_data(restart);
+	return -ERESTART_RESTARTBLOCK;
+}
+
 #ifndef THREAD_ALIGN
 #define THREAD_ALIGN	THREAD_SIZE
 #endif
diff --git a/kernel/futex.c b/kernel/futex.c
index e68db7745039..00febd6dea9c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2728,14 +2728,13 @@ retry:
 		goto out;
 
 	restart = &current->restart_block;
-	restart->fn = futex_wait_restart;
 	restart->futex.uaddr = uaddr;
 	restart->futex.val = val;
 	restart->futex.time = *abs_time;
 	restart->futex.bitset = bitset;
 	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
 
-	ret = -ERESTART_RESTARTBLOCK;
+	ret = set_restart_fn(restart, futex_wait_restart);
 
 out:
 	if (to) {
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 98d7a15e8cf6..4d94e2b5499d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -854,9 +854,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
 	if (flags == TIMER_ABSTIME)
 		return -ERESTARTNOHAND;
 
-	restart->fn = alarm_timer_nsleep_restart;
 	restart->nanosleep.clockid = type;
 	restart->nanosleep.expires = exp;
+	set_restart_fn(restart, alarm_timer_nsleep_restart);
 	return ret;
 }
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 788b9d137de4..5c9d968187ae 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1957,9 +1957,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
 	}
 
 	restart = &current->restart_block;
-	restart->fn = hrtimer_nanosleep_restart;
 	restart->nanosleep.clockid = t.timer.base->clockid;
 	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+	set_restart_fn(restart, hrtimer_nanosleep_restart);
 out:
 	destroy_hrtimer_on_stack(&t.timer);
 	return ret;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a71758e34e45..9abe15255bc4 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1480,8 +1480,8 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		if (flags & TIMER_ABSTIME)
 			return -ERESTARTNOHAND;
 
-		restart_block->fn = posix_cpu_nsleep_restart;
 		restart_block->nanosleep.clockid = which_clock;
+		set_restart_fn(restart_block, posix_cpu_nsleep_restart);
 	}
 	return error;
 }
--
cgit v1.2.3

From 6980d29ce4da223ad7f0751c7f1d61d3c6b54ab3 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Tue, 16 Mar 2021 20:30:26 +0800
Subject: zonefs: fix to update .i_wr_refcnt correctly in zonefs_open_zone()

In zonefs_open_zone(), if the opened zone count is larger than the
.s_max_open_zones threshold, we fail to restore .i_wr_refcnt; fix this.

Fixes: b5c00e975779 ("zonefs: open/close zone on file open/close")
Cc: stable@vger.kernel.org
Signed-off-by: Chao Yu
Signed-off-by: Damien Le Moal
---
 fs/zonefs/super.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 3427c99abb4d..049e36c69ed7 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1040,9 +1040,7 @@ static int zonefs_open_zone(struct inode *inode)
 
 	mutex_lock(&zi->i_truncate_mutex);
 
-	zi->i_wr_refcnt++;
-	if (zi->i_wr_refcnt == 1) {
-
+	if (!zi->i_wr_refcnt) {
 		if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
 			atomic_dec(&sbi->s_open_zones);
 			ret = -EBUSY;
@@ -1052,7 +1050,6 @@ static int zonefs_open_zone(struct inode *inode)
 		if (i_size_read(inode) < zi->i_max_size) {
 			ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
 			if (ret) {
-				zi->i_wr_refcnt--;
 				atomic_dec(&sbi->s_open_zones);
 				goto unlock;
 			}
@@ -1060,6 +1057,8 @@ static int zonefs_open_zone(struct inode *inode)
 		}
 	}
 
+	zi->i_wr_refcnt++;
+
 unlock:
 	mutex_unlock(&zi->i_truncate_mutex);
 
--
cgit v1.2.3

From f3da882eae2d6ba6c72062a46344ba096a7c2b3d Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Wed, 17 Mar 2021 17:57:31 +0900
Subject: btrfs: zoned: remove outdated WARN_ON in direct IO

In btrfs_submit_direct() there's a WARN_ON_ONCE() that will trigger if
we're submitting a DIO write on a zoned filesystem but are not using
REQ_OP_ZONE_APPEND to submit the IO to the block device.

This is a leftover from a previous version where btrfs_dio_iomap_begin()
didn't use btrfs_use_zone_append() to check for sequential write only
zones. It is an oversight from the development phase.

In v11 (I think) I've added 08f455593fff ("btrfs: zoned: cache if block
group is on a sequential zone") and forgot to remove the WARN_ON_ONCE()
for 544d24f9de73 ("btrfs: zoned: enable zone append writing for direct
IO").

When developing auto relocation I got hit by the WARN, as block groups
were relocated to a conventional zone and the dio code calls
btrfs_use_zone_append() introduced by 08f455593fff to check if it can
use zone append (a.k.a. if it's a sequential zone) or not and sets the
appropriate flags for iomap.

I've never hit it in testing before, as I was relying on emulation to
test the conventional zones code, but this one case wasn't hit, because
on emulation fs_info->max_zone_append_size is 0 and the WARN doesn't
trigger either.

Fixes: 544d24f9de73 ("btrfs: zoned: enable zone append writing for direct IO")
Signed-off-by: Johannes Thumshirn
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/inode.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5092dea21448..4120fd96a134 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8166,10 +8166,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
 		bio->bi_end_io = btrfs_end_dio_bio;
 		btrfs_io_bio(bio)->logical = file_offset;
 
-		WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) &&
-			     fs_info->max_zone_append_size &&
-			     bio_op(bio) != REQ_OP_ZONE_APPEND);
-
 		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
 			status = extract_ordered_extent(BTRFS_I(inode), bio,
 							file_offset);
--
cgit v1.2.3

From 82d62d06db404d03836cdabbca41d38646d97cbb Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Thu, 11 Mar 2021 11:23:15 -0500
Subject: btrfs: do not initialize dev stats if we have no dev_root

Neal reported a panic trying to use -o rescue=all

 BUG: kernel NULL pointer dereference, address: 0000000000000030
 PGD 0 P4D 0
 Oops: 0000 [#1] SMP PTI
 CPU: 0 PID: 4095 Comm: mount Not tainted 5.11.0-0.rc7.149.fc34.x86_64 #1
 RIP: 0010:btrfs_device_init_dev_stats+0x4c/0x1f0
 RSP: 0018:ffffa60285fbfb68 EFLAGS: 00010246
 RAX: 0000000000000000 RBX: ffff88b88f806498 RCX: ffff88b82e7a2a10
 RDX: ffffa60285fbfb97 RSI: ffff88b82e7a2a10 RDI: 0000000000000000
 RBP: ffff88b88f806b3c R08: 0000000000000000 R09: 0000000000000000
 R10: ffff88b82e7a2a10 R11: 0000000000000000 R12: ffff88b88f806a00
 R13: ffff88b88f806478 R14: ffff88b88f806a00 R15: ffff88b82e7a2a10
 FS:  00007f698be1ec40(0000) GS:ffff88b937e00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000030 CR3: 0000000092c9c006 CR4: 00000000003706f0
 Call Trace:
  ? btrfs_init_dev_stats+0x1f/0xf0
  btrfs_init_dev_stats+0x62/0xf0
  open_ctree+0x1019/0x15ff
  btrfs_mount_root.cold+0x13/0xfa
  legacy_get_tree+0x27/0x40
  vfs_get_tree+0x25/0xb0
  vfs_kern_mount.part.0+0x71/0xb0
  btrfs_mount+0x131/0x3d0
  ? legacy_get_tree+0x27/0x40
  ? btrfs_show_options+0x640/0x640
  legacy_get_tree+0x27/0x40
  vfs_get_tree+0x25/0xb0
  path_mount+0x441/0xa80
  __x64_sys_mount+0xf4/0x130
  do_syscall_64+0x33/0x40
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 RIP: 0033:0x7f698c04e52e

This happens because we unconditionally attempt to initialize device
stats on mount, but we may not have been able to read the device root.
Fix this by skipping the device stats initialization if we do not have
a device root.
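The shape of the fix is a common pattern for degraded or rescue mounts:
treat an optional tree root that failed to load as "nothing to do"
rather than as a fatal error. A minimal userspace sketch of that guard
follows; the struct and function names are illustrative stand-ins, not
the actual btrfs types:

  #include <stdio.h>
  #include <stdlib.h>

  /* Toy stand-ins for the btrfs structures; only the pointer chain
   * device->fs_info->dev_root matters here. */
  struct root_stub { int id; };
  struct fs_info_stub { struct root_stub *dev_root; }; /* NULL on rescue mounts */
  struct device_stub { struct fs_info_stub *fs_info; };

  /* The guard pattern of the fix: a missing optional tree root means
   * "skip this feature and report success", not "oops on NULL". */
  static int init_dev_stats(struct device_stub *device)
  {
  	if (!device->fs_info->dev_root)
  		return 0;
  	/* ... normal path: look up persisted stats via the dev root ... */
  	return 0;
  }

  int main(void)
  {
  	struct fs_info_stub fs = { .dev_root = NULL }; /* -o rescue=all case */
  	struct device_stub dev = { .fs_info = &fs };

  	printf("init_dev_stats() = %d\n", init_dev_stats(&dev));
  	return 0;
  }

Returning 0 rather than an error is deliberate: missing device stats
should not prevent a rescue mount from proceeding. The guard is only
safe if device->fs_info is always set, which is exactly what the next
commit guarantees.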
Reported-by: Neal Gompa
CC: stable@vger.kernel.org # 5.11+
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/volumes.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b8fab44394f5..e2df6e6e2c18 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7448,6 +7448,9 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device,
 	int item_size;
 	int i, ret, slot;
 
+	if (!device->fs_info->dev_root)
+		return 0;
+
 	key.objectid = BTRFS_DEV_STATS_OBJECTID;
 	key.type = BTRFS_PERSISTENT_ITEM_KEY;
 	key.offset = device->devid;
--
cgit v1.2.3

From 820a49dafc3304de06f296c35c9ff1ebc1666343 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Thu, 11 Mar 2021 11:23:14 -0500
Subject: btrfs: initialize device::fs_info always

Neal reported a panic trying to use -o rescue=all

 BUG: kernel NULL pointer dereference, address: 0000000000000030
 PGD 0 P4D 0
 Oops: 0000 [#1] SMP NOPTI
 CPU: 0 PID: 696 Comm: mount Tainted: G        W 5.12.0-rc2+ #296
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
 RIP: 0010:btrfs_device_init_dev_stats+0x1d/0x200
 RSP: 0018:ffffafaec1483bb8 EFLAGS: 00010286
 RAX: 0000000000000000 RBX: ffff9a5715bcb298 RCX: 0000000000000070
 RDX: ffff9a5703248000 RSI: ffff9a57052ea150 RDI: ffff9a5715bca400
 RBP: ffff9a57052ea150 R08: 0000000000000070 R09: ffff9a57052ea150
 R10: 000130faf0741c10 R11: 0000000000000000 R12: ffff9a5703700000
 R13: 0000000000000000 R14: ffff9a5715bcb278 R15: ffff9a57052ea150
 FS:  00007f600d122c40(0000) GS:ffff9a577bc00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000030 CR3: 0000000112a46005 CR4: 0000000000370ef0
 Call Trace:
  ? btrfs_init_dev_stats+0x1f/0xf0
  ? kmem_cache_alloc+0xef/0x1f0
  btrfs_init_dev_stats+0x5f/0xf0
  open_ctree+0x10cb/0x1720
  btrfs_mount_root.cold+0x12/0xea
  legacy_get_tree+0x27/0x40
  vfs_get_tree+0x25/0xb0
  vfs_kern_mount.part.0+0x71/0xb0
  btrfs_mount+0x10d/0x380
  legacy_get_tree+0x27/0x40
  vfs_get_tree+0x25/0xb0
  path_mount+0x433/0xa00
  __x64_sys_mount+0xe3/0x120
  do_syscall_64+0x33/0x40
  entry_SYSCALL_64_after_hwframe+0x44/0xae

This happens because when we call btrfs_init_dev_stats we dereference
device->fs_info->dev_root. However device->fs_info isn't initialized
because we were only calling btrfs_init_devices_late() if we properly
read the device root. We don't actually need the device root to init
the devices; this function simply assigns each device its ->fs_info
pointer, so it needs to be done unconditionally so that we can properly
dereference device->fs_info in rescue cases.

Reported-by: Neal Gompa
CC: stable@vger.kernel.org # 5.11+
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c2576c5fe62e..98c46cde7d79 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2387,8 +2387,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 	} else {
 		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
 		fs_info->dev_root = root;
-		btrfs_init_devices_late(fs_info);
 	}
+	/* Initialize fs_info for all devices in any case */
+	btrfs_init_devices_late(fs_info);
 
 	/* If IGNOREDATACSUMS is set don't bother reading the csum root. */
 	if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
--
cgit v1.2.3

From 3cb894972f1809aa8d087c42e5e8b26c64b7d508 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Thu, 11 Mar 2021 11:23:16 -0500
Subject: btrfs: do not initialize dev replace for bad dev root

While helping Neal fix his broken file system I added a debug patch to
catch if we were calling btrfs_search_slot with a NULL root, and this
stack trace popped:

 we tried to search with a NULL root
 CPU: 0 PID: 1760 Comm: mount Not tainted 5.11.0-155.nealbtrfstest.1.fc34.x86_64 #1
 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/22/2020
 Call Trace:
  dump_stack+0x6b/0x83
  btrfs_search_slot.cold+0x11/0x1b
  ? btrfs_init_dev_replace+0x36/0x450
  btrfs_init_dev_replace+0x71/0x450
  open_ctree+0x1054/0x1610
  btrfs_mount_root.cold+0x13/0xfa
  legacy_get_tree+0x27/0x40
  vfs_get_tree+0x25/0xb0
  vfs_kern_mount.part.0+0x71/0xb0
  btrfs_mount+0x131/0x3d0
  ? legacy_get_tree+0x27/0x40
  ? btrfs_show_options+0x640/0x640
  legacy_get_tree+0x27/0x40
  vfs_get_tree+0x25/0xb0
  path_mount+0x441/0xa80
  __x64_sys_mount+0xf4/0x130
  do_syscall_64+0x33/0x40
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 RIP: 0033:0x7f644730352e

Fix this by not starting the device replace stuff if we have a NULL
dev root.

Reported-by: Neal Gompa
CC: stable@vger.kernel.org # 5.11+
Signed-off-by: Josef Bacik
Signed-off-by: David Sterba
---
 fs/btrfs/dev-replace.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 3a9c1e046ebe..d05f73530af7 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -81,6 +81,9 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
 	struct btrfs_dev_replace_item *ptr;
 	u64 src_devid;
 
+	if (!dev_root)
+		return 0;
+
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
--
cgit v1.2.3

From ebd99a6b34fbcccf21067b66d1718000feb80ce8 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 17 Mar 2021 11:43:13 +0100
Subject: btrfs: fix build when using M=fs/btrfs

There are people building the module with M= that's supposed to be used
for external modules. This got broken in e9aa7c285d20 ("btrfs: enable
W=1 checks for btrfs").

  $ make M=fs/btrfs
  scripts/Makefile.lib:10: *** Recursive variable 'KBUILD_CFLAGS' references itself (eventually).  Stop.
  make: *** [Makefile:1755: modules] Error 2

There's a difference compared to 'make fs/btrfs/btrfs.ko' which needs
to rebuild a few more things and also the dependency modules need to be
available. It could fail with eg.

  WARNING: Symbol version dump "Module.symvers" is missing.
           Modules may not have dependencies or modversions.

In some environments it's more convenient to rebuild just the btrfs
module by M= so let's make it work.

The problem is with recursive variable evaluation in += so the
conditional C options are stored in a temporary variable to avoid the
recursion.

Signed-off-by: David Sterba
---
 fs/btrfs/Makefile | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index b634c42115ea..b4fb997eda16 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,10 +7,12 @@ subdir-ccflags-y += -Wmissing-format-attribute
 subdir-ccflags-y += -Wmissing-prototypes
 subdir-ccflags-y += -Wold-style-definition
 subdir-ccflags-y += -Wmissing-include-dirs
-subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable)
-subdir-ccflags-y += $(call cc-option, -Wunused-const-variable)
-subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned)
-subdir-ccflags-y += $(call cc-option, -Wstringop-truncation)
+condflags := \
+	$(call cc-option, -Wunused-but-set-variable)	\
+	$(call cc-option, -Wunused-const-variable)	\
+	$(call cc-option, -Wpacked-not-aligned)		\
+	$(call cc-option, -Wstringop-truncation)
+subdir-ccflags-y += $(condflags)
 # The following turn off the warnings enabled by -Wextra
 subdir-ccflags-y += -Wno-missing-field-initializers
 subdir-ccflags-y += -Wno-sign-compare
--
cgit v1.2.3

From 8d488a8c7ba22d7112fbf6b0a82beb1cdea1c0d5 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 16 Mar 2021 16:53:46 +0000
Subject: btrfs: fix subvolume/snapshot deletion not triggered on mount

During the mount procedure we are calling btrfs_orphan_cleanup() against
the root tree, which will find all orphan items in this tree. When an
orphan item corresponds to a deleted subvolume/snapshot (instead of an
inode space cache), it must not delete the orphan item, because that will
cause btrfs_find_orphan_roots() to not find the orphan item and therefore
not add the corresponding subvolume root to the list of dead roots, which
results in the subvolume's tree never being deleted by the cleanup thread.

The same applies to the remount from RO to RW path.

Fix this by making btrfs_find_orphan_roots() run before calling
btrfs_orphan_cleanup() against the root tree.

A test case for fstests will follow soon.

Reported-by: Robbie Ko
Link: https://lore.kernel.org/linux-btrfs/b19f4310-35e0-606e-1eea-2dd84d28c5da@synology.com/
Fixes: 638331fa56caea ("btrfs: fix transaction leak and crash after cleaning up orphans on RO mount")
CC: stable@vger.kernel.org # 5.11+
Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 98c46cde7d79..48e9cf01abc5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3010,6 +3010,21 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
 		}
 	}
 
+	/*
+	 * btrfs_find_orphan_roots() is responsible for finding all the dead
+	 * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
+	 * them into the fs_info->fs_roots_radix tree. This must be done before
+	 * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
+	 * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
+	 * item before the root's tree is deleted - this means that if we unmount
+	 * or crash before the deletion completes, on the next mount we will not
+	 * delete what remains of the tree because the orphan item does not
+	 * exists anymore, which is what tells us we have a pending deletion.
+	 */
+	ret = btrfs_find_orphan_roots(fs_info);
+	if (ret)
+		goto out;
+
 	ret = btrfs_cleanup_fs_roots(fs_info);
 	if (ret)
 		goto out;
@@ -3069,7 +3084,6 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
 		}
 	}
 
-	ret = btrfs_find_orphan_roots(fs_info);
 out:
 	return ret;
 }
--
cgit v1.2.3

From 76cd979f4f38a27df22efb5773a0d567181a9392 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Tue, 16 Mar 2021 16:33:27 +0100
Subject: io_uring: imply MSG_NOSIGNAL for send[msg]()/recv[msg]() calls

We never want to generate any SIGPIPE, -EPIPE only is much better.

Signed-off-by: Stefan Metzmacher
Link: https://lore.kernel.org/r/38961085c3ec49fd21550c7788f214d1ff02d2d4.1615908477.git.metze@samba.org
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5f4e312111ea..a81f7a30ea70 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4384,7 +4384,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 		kmsg = &iomsg;
 	}
 
-	flags = req->sr_msg.msg_flags;
+	flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
 	if (flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	else if (issue_flags & IO_URING_F_NONBLOCK)
@@ -4428,7 +4428,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	msg.msg_controllen = 0;
 	msg.msg_namelen = 0;
 
-	flags = req->sr_msg.msg_flags;
+	flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
 	if (flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	else if (issue_flags & IO_URING_F_NONBLOCK)
@@ -4618,7 +4618,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 				1, req->sr_msg.len);
 	}
 
-	flags = req->sr_msg.msg_flags;
+	flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
 	if (flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	else if (force_nonblock)
@@ -4677,7 +4677,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	msg.msg_iocb = NULL;
 	msg.msg_flags = 0;
 
-	flags = req->sr_msg.msg_flags;
+	flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
 	if (flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	else if (force_nonblock)
--
cgit v1.2.3

From 53e043b2b432ef2294efec04dd8a88d96c024624 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Mon, 15 Mar 2021 12:56:56 +0100
Subject: io_uring: remove structures from include/linux/io_uring.h

Link: https://lore.kernel.org/r/8c1d14f3748105f4caeda01716d47af2fa41d11c.1615809009.git.metze@samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Jens Axboe
---
 fs/io-wq.h               | 10 +++++++++-
 fs/io_uring.c            | 16 ++++++++++++++++
 include/linux/io_uring.h | 25 -------------------------
 3 files changed, 25 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/io-wq.h b/fs/io-wq.h
index 1ac2f3248088..80d590564ff9 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -2,7 +2,6 @@
 #define INTERNAL_IO_WQ_H
 
 #include <linux/refcount.h>
-#include <linux/io_uring.h>
 
 struct io_wq;
 
@@ -21,6 +20,15 @@ enum io_wq_cancel {
 	IO_WQ_CANCEL_NOTFOUND,	/* work not found */
 };
 
+struct io_wq_work_node {
+	struct io_wq_work_node *next;
+};
+
+struct io_wq_work_list {
+	struct io_wq_work_node *first;
+	struct io_wq_work_node *last;
+};
+
 static inline void wq_list_add_after(struct io_wq_work_node *node,
 				     struct io_wq_work_node *pos,
 				     struct io_wq_work_list *list)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a81f7a30ea70..52ba8d7f3eb8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -456,6 +456,22 @@ struct io_ring_ctx {
 	struct list_head	tctx_list;
 };
 
+struct io_uring_task {
+	/* submission side */
+	struct xarray		xa;
+	struct wait_queue_head	wait;
+	void			*last;
+	void			*io_wq;
+	struct percpu_counter	inflight;
+	atomic_t		in_idle;
+	bool			sqpoll;
+
+	spinlock_t		task_lock;
+	struct io_wq_work_list	task_list;
+	unsigned long		task_state;
+	struct callback_head	task_work;
+};
+
 /*
  * First field must be the file pointer in all the
  * iocb unions! See also 'struct kiocb' in
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 9761a0ec9f95..79cde9906be0 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -5,31 +5,6 @@
 #include <linux/sched.h>
 #include <linux/xarray.h>
 
-struct io_wq_work_node {
-	struct io_wq_work_node *next;
-};
-
-struct io_wq_work_list {
-	struct io_wq_work_node *first;
-	struct io_wq_work_node *last;
-};
-
-struct io_uring_task {
-	/* submission side */
-	struct xarray		xa;
-	struct wait_queue_head	wait;
-	void			*last;
-	void			*io_wq;
-	struct percpu_counter	inflight;
-	atomic_t		in_idle;
-	bool			sqpoll;
-
-	spinlock_t		task_lock;
-	struct io_wq_work_list	task_list;
-	unsigned long		task_state;
-	struct callback_head	task_work;
-};
-
 #if defined(CONFIG_IO_URING)
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_task_cancel(void);
--
cgit v1.2.3

From ee53fb2b197b72b126ca0387ae636da75d969428 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Mon, 15 Mar 2021 12:56:57 +0100
Subject: io_uring: use typesafe pointers in io_uring_task

Signed-off-by: Stefan Metzmacher
Link: https://lore.kernel.org/r/ce2a598e66e48347bb04afbaf2acc67c0cc7971a.1615809009.git.metze@samba.org
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 52ba8d7f3eb8..675073966760 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -460,8 +460,8 @@ struct io_uring_task {
 	/* submission side */
 	struct xarray		xa;
 	struct wait_queue_head	wait;
-	void			*last;
-	void			*io_wq;
+	const struct io_ring_ctx *last;
+	struct io_wq		*io_wq;
 	struct percpu_counter	inflight;
 	atomic_t		in_idle;
 	bool			sqpoll;
--
cgit v1.2.3

From de75a3d3f5a14c9ab3c4883de3471d3c92a8ee78 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 18 Mar 2021 11:54:35 +0000
Subject: io_uring: don't leak creds on SQO attach error

Attaching to an already dead/dying SQPOLL task is disallowed in
io_sq_offload_create(), but the cleanup is hand coded by calling
io_put_sq_data()/etc., and misses putting ctx->sq_creds. Defer
everything to error-path io_sq_thread_finish(), adding ctx->sqd_list in
the error case as well, as finish will handle it.
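The cleanup discipline behind this fix is general: once error unwinding
is hand-coded in several places, it is easy to forget one reference
(here ctx->sq_creds). A minimal userspace sketch of the
"single teardown path" structure follows; the names and the plain
malloc/free resources are invented stand-ins, not the io_uring API:

  #include <stdlib.h>

  /* Invented stand-ins for the refcounted objects involved. */
  struct sq_data { int dummy; };
  struct creds   { int dummy; };

  struct ring_ctx {
  	struct sq_data *sq_data;
  	struct creds   *sq_creds;
  };

  /* The single teardown path: releases everything setup may have taken.
   * free(NULL) is a no-op, so partially-built contexts are fine. */
  static void sq_thread_finish(struct ring_ctx *ctx)
  {
  	free(ctx->sq_creds);
  	free(ctx->sq_data);
  	ctx->sq_creds = NULL;
  	ctx->sq_data = NULL;
  }

  /* Setup acquires resources first; on any failure it defers the whole
   * unwind to sq_thread_finish() instead of hand-rolling a partial
   * cleanup (hand-rolling is how the sq_creds put was missed). */
  static int sq_offload_create(struct ring_ctx *ctx, int sqd_thread_alive)
  {
  	ctx->sq_data  = calloc(1, sizeof(*ctx->sq_data));
  	ctx->sq_creds = calloc(1, sizeof(*ctx->sq_creds));
  	if (!ctx->sq_data || !ctx->sq_creds)
  		goto err;

  	if (!sqd_thread_alive)	/* don't attach to a dying thread */
  		goto err;
  	return 0;
  err:
  	sq_thread_finish(ctx);
  	return -1;
  }

  int main(void)
  {
  	struct ring_ctx ctx = { 0 };

  	return sq_offload_create(&ctx, 0) == -1 ? 0 : 1;
  }

The design point is that error handling converges on one function that
copes with partially-built state; registering the ctx on the shared
list before the liveness check is what lets that shared teardown do the
right thing.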
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 675073966760..ea2d3e120555 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7910,22 +7910,17 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 		ret = 0;
 		io_sq_thread_park(sqd);
+		list_add(&ctx->sqd_list, &sqd->ctx_list);
+		io_sqd_update_thread_idle(sqd);
 		/* don't attach to a dying SQPOLL thread, would be racy */
-		if (attached && !sqd->thread) {
+		if (attached && !sqd->thread)
 			ret = -ENXIO;
-		} else {
-			list_add(&ctx->sqd_list, &sqd->ctx_list);
-			io_sqd_update_thread_idle(sqd);
-		}
 		io_sq_thread_unpark(sqd);
 
-		if (ret < 0) {
-			io_put_sq_data(sqd);
-			ctx->sq_data = NULL;
-			return ret;
-		} else if (attached) {
+		if (ret < 0)
+			goto err;
+		if (attached)
 			return 0;
-		}
 
 		if (p->flags & IORING_SETUP_SQ_AFF) {
 			int cpu = p->sq_thread_cpu;
--
cgit v1.2.3

From 0bb788300990d3eb5582d3301a720f846c78925c Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Thu, 18 Mar 2021 11:22:05 +0000
Subject: btrfs: fix sleep while in non-sleep context during qgroup removal

While removing a qgroup's sysfs entry we end up taking the kernfs_mutex,
through kobject_del(), while holding the fs_info->qgroup_lock spinlock,
producing the following trace:

 [821.843637] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:281
 [821.843641] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 28214, name: podman
 [821.843644] CPU: 3 PID: 28214 Comm: podman Tainted: G        W 5.11.6 #15
 [821.843646] Hardware name: Dell Inc. PowerEdge R330/084XW4, BIOS 2.11.0 12/08/2020
 [821.843647] Call Trace:
 [821.843650]  dump_stack+0xa1/0xfb
 [821.843656]  ___might_sleep+0x144/0x160
 [821.843659]  mutex_lock+0x17/0x40
 [821.843662]  kernfs_remove_by_name_ns+0x1f/0x80
 [821.843666]  sysfs_remove_group+0x7d/0xe0
 [821.843668]  sysfs_remove_groups+0x28/0x40
 [821.843670]  kobject_del+0x2a/0x80
 [821.843672]  btrfs_sysfs_del_one_qgroup+0x2b/0x40 [btrfs]
 [821.843685]  __del_qgroup_rb+0x12/0x150 [btrfs]
 [821.843696]  btrfs_remove_qgroup+0x288/0x2a0 [btrfs]
 [821.843707]  btrfs_ioctl+0x3129/0x36a0 [btrfs]
 [821.843717]  ? __mod_lruvec_page_state+0x5e/0xb0
 [821.843719]  ? page_add_new_anon_rmap+0xbc/0x150
 [821.843723]  ? kfree+0x1b4/0x300
 [821.843725]  ? mntput_no_expire+0x55/0x330
 [821.843728]  __x64_sys_ioctl+0x5a/0xa0
 [821.843731]  do_syscall_64+0x33/0x70
 [821.843733]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 [821.843736] RIP: 0033:0x4cd3fb
 [821.843741] RSP: 002b:000000c000906b20 EFLAGS: 00000206 ORIG_RAX: 0000000000000010
 [821.843744] RAX: ffffffffffffffda RBX: 000000c000050000 RCX: 00000000004cd3fb
 [821.843745] RDX: 000000c000906b98 RSI: 000000004010942a RDI: 000000000000000f
 [821.843747] RBP: 000000c000907cd0 R08: 000000c000622901 R09: 0000000000000000
 [821.843748] R10: 000000c000d992c0 R11: 0000000000000206 R12: 000000000000012d
 [821.843749] R13: 000000000000012c R14: 0000000000000200 R15: 0000000000000049

Fix this by removing the qgroup sysfs entry while not holding the
spinlock, since the spinlock is only meant for protection of the qgroup
rbtree.
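The rule being applied is generic: a spinlock holder must not call
anything that can sleep, and taking a mutex (as kernfs does here)
sleeps. The restructuring keeps the spinlock's critical section down to
the data-structure manipulation and moves the sleeping call after the
unlock. A userspace sketch of the same restructuring, with pthread
primitives as rough stand-ins and all names invented for illustration:

  #include <pthread.h>
  #include <stdlib.h>

  struct qgroup_stub { int id; };

  /* spinlock-like lock protecting only the qgroup lookup structure */
  static pthread_spinlock_t tree_lock;
  /* a sleeping lock, standing in for kernfs_mutex */
  static pthread_mutex_t sysfs_mutex = PTHREAD_MUTEX_INITIALIZER;

  static struct qgroup_stub *the_qgroup;	/* toy "tree" with one entry */

  /* may block on sysfs_mutex, so it must never run under tree_lock */
  static void sysfs_del_one_qgroup(struct qgroup_stub *qg)
  {
  	pthread_mutex_lock(&sysfs_mutex);
  	(void)qg;	/* ... remove the sysfs entry ... */
  	pthread_mutex_unlock(&sysfs_mutex);
  }

  static void remove_qgroup(void)
  {
  	struct qgroup_stub *qg;

  	/* critical section covers only the data-structure update */
  	pthread_spin_lock(&tree_lock);
  	qg = the_qgroup;
  	the_qgroup = NULL;
  	pthread_spin_unlock(&tree_lock);

  	/* sleeping work happens after the spinlock is dropped */
  	if (qg) {
  		sysfs_del_one_qgroup(qg);
  		free(qg);
  	}
  }

  int main(void)
  {
  	pthread_spin_init(&tree_lock, PTHREAD_PROCESS_PRIVATE);
  	the_qgroup = calloc(1, sizeof(*the_qgroup));
  	remove_qgroup();
  	pthread_spin_destroy(&tree_lock);
  	return 0;
  }

Note how the object is only unlinked under the lock and destroyed
afterwards; the patch below does the same by deferring both
btrfs_sysfs_del_one_qgroup() and kfree() until after the spinlock is
dropped.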
Reported-by: Stuart Shelton
Link: https://lore.kernel.org/linux-btrfs/7A5485BB-0628-419D-A4D3-27B1AF47E25A@gmail.com/
Fixes: 49e5fb46211de0 ("btrfs: qgroup: export qgroups in sysfs")
CC: stable@vger.kernel.org # 5.10+
Reviewed-by: Qu Wenruo
Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/qgroup.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 14ff388fd3bd..f0b9ef13153a 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -226,7 +226,6 @@ static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
 {
 	struct btrfs_qgroup_list *list;
 
-	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
 	list_del(&qgroup->dirty);
 	while (!list_empty(&qgroup->groups)) {
 		list = list_first_entry(&qgroup->groups,
@@ -243,7 +242,6 @@ static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
 		list_del(&list->next_member);
 		kfree(list);
 	}
-	kfree(qgroup);
 }
 
 /* must be called with qgroup_lock held */
@@ -569,6 +567,8 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
 		qgroup = rb_entry(n, struct btrfs_qgroup, node);
 		rb_erase(n, &fs_info->qgroup_tree);
 		__del_qgroup_rb(fs_info, qgroup);
+		btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+		kfree(qgroup);
 	}
 	/*
 	 * We call btrfs_free_qgroup_config() when unmounting
@@ -1578,6 +1578,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
 	spin_lock(&fs_info->qgroup_lock);
 	del_qgroup_rb(fs_info, qgroupid);
 	spin_unlock(&fs_info->qgroup_lock);
+
+	/*
+	 * Remove the qgroup from sysfs now without holding the qgroup_lock
+	 * spinlock, since the sysfs_remove_group() function needs to take
+	 * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
+	 */
+	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+	kfree(qgroup);
 out:
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	return ret;
--
cgit v1.2.3

From c1d6abdac46ca8127274bea195d804e3f2cec7ee Mon Sep 17 00:00:00 2001
From: Omar Sandoval
Date: Tue, 16 Mar 2021 12:43:00 -0700
Subject: btrfs: fix check_data_csum() error message for direct I/O

Commit 1dae796aabf6 ("btrfs: inode: sink parameter start and len to
check_data_csum()") replaced the start parameter to check_data_csum()
with page_offset(), but page_offset() is not meaningful for direct I/O
pages. Bring back the start parameter.

Fixes: 265d4ac03fdf ("btrfs: sink parameter start and len to check_data_csum")
CC: stable@vger.kernel.org # 5.11+
Reviewed-by: Qu Wenruo
Signed-off-by: Omar Sandoval
Signed-off-by: David Sterba
---
 fs/btrfs/inode.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4120fd96a134..a7be4c9ec835 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3099,11 +3099,13 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
  * @bio_offset:	offset to the beginning of the bio (in bytes)
  * @page:	page where is the data to be verified
  * @pgoff:	offset inside the page
+ * @start:	logical offset in the file
  *
  * The length of such check is always one sector size.
  */
 static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
-			   u32 bio_offset, struct page *page, u32 pgoff)
+			   u32 bio_offset, struct page *page, u32 pgoff,
+			   u64 start)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
@@ -3130,8 +3132,8 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
 	kunmap_atomic(kaddr);
 	return 0;
 zeroit:
-	btrfs_print_data_csum_error(BTRFS_I(inode), page_offset(page) + pgoff,
-				    csum, csum_expected, io_bio->mirror_num);
+	btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
+				    io_bio->mirror_num);
 	if (io_bio->device)
 		btrfs_dev_stat_inc_and_print(io_bio->device,
 					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -3184,7 +3186,8 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
 	     pg_off += sectorsize, bio_offset += sectorsize) {
 		int ret;
 
-		ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off);
+		ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
+				      page_offset(page) + pg_off);
 		if (ret < 0)
 			return -EIO;
 	}
@@ -7907,7 +7910,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
 			ASSERT(pgoff < PAGE_SIZE);
 			if (uptodate &&
 			    (!csum || !check_data_csum(inode, io_bio,
-					bio_offset, bvec.bv_page, pgoff))) {
+						       bio_offset, bvec.bv_page,
+						       pgoff, start))) {
 				clean_io_failure(fs_info, failure_tree, io_tree,
 						 start, bvec.bv_page,
 						 btrfs_ino(BTRFS_I(inode)),
--
cgit v1.2.3

From 403dba003d17b3f0c1627b355cec2d74041cf648 Mon Sep 17 00:00:00 2001
From: Liu xuzhi
Date: Thu, 18 Mar 2021 17:46:57 -0700
Subject: fs/cifs/: fix misspellings using codespell tool

A typo was found by the codespell tool at line 251 of cifs_swn.c:

$ codespell ./fs/cifs/
./cifs_swn.c:251: funciton ==> function

Fix the typo found by codespell.

Signed-off-by: Liu xuzhi
Signed-off-by: Steve French
---
 fs/cifs/cifs_swn.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index f2d730fffccb..d829b8bf833e 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -248,7 +248,7 @@ nlmsg_fail:
 
 /*
  * Try to find a matching registration for the tcon's server name and share name.
- * Calls to this funciton must be protected by cifs_swnreg_idr_mutex.
+ * Calls to this function must be protected by cifs_swnreg_idr_mutex.
 * TODO Try to avoid memory allocations
 */
static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon)
--
cgit v1.2.3

From af3ef3b1031634724a3763606695ebcd113d782b Mon Sep 17 00:00:00 2001
From: Aurelien Aptel
Date: Thu, 18 Mar 2021 19:17:10 +0100
Subject: cifs: warn and fail if trying to use rootfs without the config option

If CONFIG_CIFS_ROOT is not set, the rootfs mount option is invalid.

Signed-off-by: Aurelien Aptel
CC: stable@vger.kernel.org # v5.11
Signed-off-by: Steve French
---
 fs/cifs/fs_context.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 892f51a21278..78889024a7ed 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -1196,9 +1196,11 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
 		pr_warn_once("Witness protocol support is experimental\n");
 		break;
 	case Opt_rootfs:
-#ifdef CONFIG_CIFS_ROOT
-		ctx->rootfs = true;
+#ifndef CONFIG_CIFS_ROOT
+		cifs_dbg(VFS, "rootfs support requires CONFIG_CIFS_ROOT config option\n");
+		goto cifs_parse_mount_err;
 #endif
+		ctx->rootfs = true;
 		break;
 	case Opt_posixpaths:
 		if (result.negated)
--
cgit v1.2.3

From 65af8f0166f4d15e61c63db498ec7981acdd897f Mon Sep 17 00:00:00 2001
From: Steve French
Date: Fri, 19 Mar 2021 00:05:48 -0500
Subject: cifs: fix allocation size on newly created files

Applications that create, extend, and write to a file do not expect to
see a 0 allocation size. When a file is extended, set its allocation
size to a plausible value until we have a chance to query the server
for it. When the file is cached this will prevent showing an impossible
number of allocated blocks (like 0). This fixes e.g. xfstests 614 which
does

    1) create a file and set its size to 64K
    2) mmap write 64K to the file
    3) stat -c %b for the file (to query the number of allocated blocks)

It was failing because we returned 0 blocks. Even though we would
return the correct cached file size, we returned an impossible
allocation size.

Signed-off-by: Steve French
CC: stable@vger.kernel.org
Reviewed-by: Aurelien Aptel
---
 fs/cifs/inode.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 7c61bc9573c0..f2df4422e54a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2395,7 +2395,7 @@ int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 	 * We need to be sure that all dirty pages are written and the server
 	 * has actual ctime, mtime and file length.
 	 */
-	if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_SIZE)) &&
+	if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_SIZE | STATX_BLOCKS)) &&
 	    !CIFS_CACHE_READ(CIFS_I(inode)) &&
 	    inode->i_mapping && inode->i_mapping->nrpages != 0) {
 		rc = filemap_fdatawait(inode->i_mapping);
@@ -2585,6 +2585,14 @@ set_size_out:
 	if (rc == 0) {
 		cifsInode->server_eof = attrs->ia_size;
 		cifs_setsize(inode, attrs->ia_size);
+		/*
+		 * i_blocks is not related to (i_size / i_blksize), but instead
+		 * 512 byte (2**9) size is required for calculating num blocks.
+		 * Until we can query the server for actual allocation size,
+		 * this is best estimate we have for blocks allocated for a file
+		 * Number of blocks must be rounded up so size 1 is not 0 blocks
+		 */
+		inode->i_blocks = (512 - 1 + attrs->ia_size) >> 9;
 
 		/*
 		 * The man page of truncate says if the size changed,
--
cgit v1.2.3

From b7ff91fd030dc9d72ed91b1aab36e445a003af4f Mon Sep 17 00:00:00 2001
From: "zhangyi (F)"
Date: Wed, 3 Mar 2021 21:17:02 +0800
Subject: ext4: find old entry again if failed to rename whiteout

If we fail to add the new entry on rename whiteout, we cannot reset the
old->de entry directly, because old->de could have moved from under us
while making the directory indexed. So we need to find the old entry
again before resetting it, otherwise it may corrupt the filesystem as
below.

 /dev/sda: Entry '00000001' in ??? (12) has deleted/unused inode 15. CLEARED.
 /dev/sda: Unattached inode 75
 /dev/sda: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.

Fixes: 6b4b8e6b4ad ("ext4: fix bug for rename with RENAME_WHITEOUT")
Cc: stable@vger.kernel.org
Signed-off-by: zhangyi (F)
Link: https://lore.kernel.org/r/20210303131703.330415-1-yi.zhang@huawei.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/namei.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 686bf982c84e..4aae59d55288 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3613,6 +3613,31 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
 	return retval;
 }
 
+static void ext4_resetent(handle_t *handle, struct ext4_renament *ent,
+			  unsigned ino, unsigned file_type)
+{
+	struct ext4_renament old = *ent;
+	int retval = 0;
+
+	/*
+	 * old->de could have moved from under us during make indexed dir,
+	 * so the old->de may no longer valid and need to find it again
+	 * before reset old inode info.
+	 */
+	old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
+	if (IS_ERR(old.bh))
+		retval = PTR_ERR(old.bh);
+	if (!old.bh)
+		retval = -ENOENT;
+	if (retval) {
+		ext4_std_error(old.dir->i_sb, retval);
+		return;
+	}
+
+	ext4_setent(handle, &old, ino, file_type);
+	brelse(old.bh);
+}
+
 static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
 				  const struct qstr *d_name)
 {
@@ -3937,8 +3962,8 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 end_rename:
 	if (whiteout) {
 		if (retval) {
-			ext4_setent(handle, &old,
-				    old.inode->i_ino, old_file_type);
+			ext4_resetent(handle, &old,
+				      old.inode->i_ino, old_file_type);
 			drop_nlink(whiteout);
 		}
 		unlock_new_inode(whiteout);
--
cgit v1.2.3

From 5dccdc5a1916d4266edd251f20bbbb113a5c495f Mon Sep 17 00:00:00 2001
From: "zhangyi (F)"
Date: Wed, 3 Mar 2021 21:17:03 +0800
Subject: ext4: do not iput inode under running transaction in ext4_rename()

In ext4_rename(), when RENAME_WHITEOUT fails to add the new entry into
the directory, it ends up dropping the newly created whiteout inode
under the running transaction. After commit <9b88f9fb0d2> ("ext4: Do
not iput inode under running transaction"), we follow the assumption
that evict() does not get called from a transaction context, but
ext4_rename() breaks this rule. Although it's not a real problem, it is
better to obey it, so this patch adds the inode to the orphan list and
stops the transaction before the final iput().
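The ordering being enforced generalizes to any journaled filesystem:
make the on-disk state recoverable (orphan list), close the journal
handle, and only then drop the last inode reference, so that evict()
never nests inside the still-running transaction. A toy model of that
rule follows; every name in it is invented for illustration and this is
not the ext4 API:

  #include <assert.h>
  #include <stdlib.h>

  /* Models "this task has an open journal handle". */
  static int handle_open;

  struct toy_inode { long refcount; int on_orphan_list; };

  static void evict(struct toy_inode *inode)
  {
  	/* evicting may need its own transaction; nesting is the bug */
  	assert(!handle_open);
  	free(inode);
  }

  static void iput(struct toy_inode *inode)
  {
  	if (--inode->refcount == 0)
  		evict(inode);
  }

  int main(void)
  {
  	struct toy_inode *whiteout = calloc(1, sizeof(*whiteout));

  	whiteout->refcount = 1;

  	handle_open = 1;		/* journal_start() */
  	whiteout->on_orphan_list = 1;	/* orphan_add(): recoverable on crash */
  	handle_open = 0;		/* journal_stop() BEFORE the final iput() */

  	iput(whiteout);			/* evict() now runs outside the handle */
  	return 0;
  }

The orphan-list step is what keeps the early journal stop safe: if the
system crashes after the handle is closed but before iput() finishes
the job, recovery can still complete the deletion.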
Signed-off-by: zhangyi (F) Link: https://lore.kernel.org/r/20210303131703.330415-2-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 4aae59d55288..1fe8370dbf0b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3799,14 +3799,14 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, */ retval = -ENOENT; if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) - goto end_rename; + goto release_bh; new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); new.bh = NULL; - goto end_rename; + goto release_bh; } if (new.bh) { if (!new.inode) { @@ -3823,15 +3823,13 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); if (IS_ERR(handle)) { retval = PTR_ERR(handle); - handle = NULL; - goto end_rename; + goto release_bh; } } else { whiteout = ext4_whiteout_for_rename(mnt_userns, &old, credits, &handle); if (IS_ERR(whiteout)) { retval = PTR_ERR(whiteout); - whiteout = NULL; - goto end_rename; + goto release_bh; } } @@ -3965,16 +3963,18 @@ end_rename: ext4_resetent(handle, &old, old.inode->i_ino, old_file_type); drop_nlink(whiteout); + ext4_orphan_add(handle, whiteout); } unlock_new_inode(whiteout); + ext4_journal_stop(handle); iput(whiteout); - + } else { + ext4_journal_stop(handle); } +release_bh: brelse(old.dir_bh); brelse(old.bh); brelse(new.bh); - if (handle) - ext4_journal_stop(handle); return retval; } -- cgit v1.2.3 From 6b22489911b726eebbf169caee52fea52013fbdd Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Fri, 5 Mar 2021 20:05:08 +0800 Subject: ext4: do not try to set xattr into ea_inode if value is empty Syzbot report a warning that ext4 may create an empty ea_inode if set an empty extent attribute to a file on the file system which is no free blocks left. WARNING: CPU: 6 PID: 10667 at fs/ext4/xattr.c:1640 ext4_xattr_set_entry+0x10f8/0x1114 fs/ext4/xattr.c:1640 ... Call trace: ext4_xattr_set_entry+0x10f8/0x1114 fs/ext4/xattr.c:1640 ext4_xattr_block_set+0x1d0/0x1b1c fs/ext4/xattr.c:1942 ext4_xattr_set_handle+0x8a0/0xf1c fs/ext4/xattr.c:2390 ext4_xattr_set+0x120/0x1f0 fs/ext4/xattr.c:2491 ext4_xattr_trusted_set+0x48/0x5c fs/ext4/xattr_trusted.c:37 __vfs_setxattr+0x208/0x23c fs/xattr.c:177 ... Now, ext4 try to store extent attribute into an external inode if ext4_xattr_block_set() return -ENOSPC, but for the case of store an empty extent attribute, store the extent entry into the extent attribute block is enough. A simple reproduce below. fallocate test.img -l 1M mkfs.ext4 -F -b 2048 -O ea_inode test.img mount test.img /mnt dd if=/dev/zero of=/mnt/foo bs=2048 count=500 setfattr -n "user.test" /mnt/foo Reported-by: syzbot+98b881fdd8ebf45ab4ae@syzkaller.appspotmail.com Fixes: 9c6e7853c531 ("ext4: reserve space for xattr entries/names") Cc: stable@kernel.org Signed-off-by: zhangyi (F) Link: https://lore.kernel.org/r/20210305120508.298465-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 083c95126781..6c1018223c54 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2404,7 +2404,7 @@ retry_inode: * external inode if possible. 
				 */
				if (ext4_has_feature_ea_inode(inode->i_sb) &&
-				    !i.in_inode) {
+				    i.value_len && !i.in_inode) {
					i.in_inode = 1;
					goto retry_inode;
				}
--
cgit v1.2.3


From 7d8bd3c76da1d94b85e6c9b7007e20e980bfcfe6 Mon Sep 17 00:00:00 2001
From: Shijie Luo
Date: Fri, 12 Mar 2021 01:50:51 -0500
Subject: ext4: fix potential error in ext4_do_update_inode

If set_large_file = 1 and an error occurs in ext4_handle_dirty_metadata(),
the error code will be overridden; go to out_brelse to avoid this
situation.

Signed-off-by: Shijie Luo
Link: https://lore.kernel.org/r/20210312065051.36314-1-luoshijie1@huawei.com
Cc: stable@kernel.org
Reviewed-by: Jan Kara
Signed-off-by: Theodore Ts'o
---
 fs/ext4/inode.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a79a9ea58c56..927e47db7f00 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5026,7 +5026,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct buffer_head *bh = iloc->bh;
 	struct super_block *sb = inode->i_sb;
-	int err = 0, rc, block;
+	int err = 0, block;
 	int need_datasync = 0, set_large_file = 0;
 	uid_t i_uid;
 	gid_t i_gid;
@@ -5138,9 +5138,9 @@ static int ext4_do_update_inode(handle_t *handle,
 					      bh->b_data);
 
 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-	rc = ext4_handle_dirty_metadata(handle, NULL, bh);
-	if (!err)
-		err = rc;
+	err = ext4_handle_dirty_metadata(handle, NULL, bh);
+	if (err)
+		goto out_brelse;
 	ext4_clear_inode_state(inode, EXT4_STATE_NEW);
 	if (set_large_file) {
 		BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
--
cgit v1.2.3


From 2a4ae3bcdf05b8639406eaa09a2939f3c6dd8e75 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Mon, 15 Mar 2021 17:59:06 +0100
Subject: ext4: fix timer use-after-free on failed mount

When a filesystem mount fails because of a corrupted filesystem, we
first cancel the s_err_report timer that reports fs errors every day,
and only then do we flush s_error_work. However, s_error_work may report
another fs error and re-arm the timer, resulting in a timer
use-after-free. Fix the problem by first flushing the work and only
after that canceling the s_err_report timer.

Reported-by: syzbot+628472a2aac693ab0fcd@syzkaller.appspotmail.com
Fixes: 2d01ddc86606 ("ext4: save error info to sb through journal if available")
CC: stable@vger.kernel.org
Signed-off-by: Jan Kara
Link: https://lore.kernel.org/r/20210315165906.2175-1-jack@suse.cz
Signed-off-by: Theodore Ts'o
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a0a256859662..b9693680463a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5154,8 +5154,8 @@ failed_mount_wq:
 failed_mount3a:
 	ext4_es_unregister_shrinker(sbi);
 failed_mount3:
-	del_timer_sync(&sbi->s_err_report);
 	flush_work(&sbi->s_error_work);
+	del_timer_sync(&sbi->s_err_report);
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
--
cgit v1.2.3


From 8210bb29c1b66200cff7b25febcf6e39baf49fbf Mon Sep 17 00:00:00 2001
From: Harshad Shirwadkar
Date: Tue, 16 Mar 2021 15:19:21 -0700
Subject: ext4: fix rename whiteout with fast commit

This patch adds rename whiteout support in fast commits. Note that the
whiteout object that gets created is actually a char device. This
implies that the function ext4_inode_journal_mode(struct inode *inode)
would return "JOURNAL_DATA" for this inode.
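
As a refresher on why the whiteout is a char device at all, here is a
minimal userspace illustration (self-contained, but the file names are
hypothetical; needs CAP_MKNOD and a filesystem with whiteout support
such as ext4):

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/stat.h>
  #include <sys/sysmacros.h>

  int main(void)
  {
  	struct stat st;

  	/* Rename "old" to "new", leaving a whiteout at the old name. */
  	if (renameat2(AT_FDCWD, "old", AT_FDCWD, "new", RENAME_WHITEOUT))
  		return perror("renameat2"), 1;

  	/* The whiteout is a 0:0 character device. */
  	if (stat("old", &st) == 0 && S_ISCHR(st.st_mode))
  		printf("whiteout: chrdev %u:%u\n",
  		       major(st.st_rdev), minor(st.st_rdev));
  	return 0;
  }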
The char-device whiteout has a consequence in the fast commit code: it
makes creation of the whiteout object a fast-commit-ineligible
operation, and we thus fall back to full commits. With this patch, this
can be observed by running fast commits with rename whiteout and looking
at the stats generated by the ext4_fc_stats tracepoint:

 ext4_fc_stats: dev 254:32 fc ineligible reasons:
 XATTR:0, CROSS_RENAME:0, JOURNAL_FLAG_CHANGE:0, NO_MEM:0, SWAP_BOOT:0,
 RESIZE:0, RENAME_DIR:0, FALLOC_RANGE:0, INODE_JOURNAL_DATA:16;
 num_commits:6, ineligible: 6, numblks: 3

So in short, this patch guarantees that in the case of rename whiteout,
we fall back to full commits.

Amir mentioned that instead of creating a new whiteout object for every
rename, we could create a static whiteout object with an irrelevant
nlink. That would allow fast commits to avoid falling back to full
commits. But until this happens, this patch ensures correctness by
falling back to full commits.

Fixes: 8016e29f4362 ("ext4: fast commit recovery path")
Cc: stable@kernel.org
Signed-off-by: Harshad Shirwadkar
Link: https://lore.kernel.org/r/20210316221921.1124955-1-harshadshirwadkar@gmail.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/ext4.h        | 2 ++
 fs/ext4/fast_commit.c | 9 +++++++--
 fs/ext4/namei.c       | 3 +++
 3 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ea3b41579f38..826a56e3bbd2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2794,6 +2794,8 @@ void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
 	struct dentry *dentry);
 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
+void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+			    struct dentry *dentry);
 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
 void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
 void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 6c4f19b0a556..7541d0b5d706 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -513,10 +513,10 @@ void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
 }
 
-void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
+void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+			    struct dentry *dentry)
 {
 	struct __track_dentry_update_args args;
-	struct inode *inode = d_inode(dentry);
 	int ret;
 
 	args.dentry = dentry;
@@ -527,6 +527,11 @@ void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 	trace_ext4_fc_track_create(inode, dentry, ret);
 }
 
+void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
+{
+	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
+}
+
 /* __track_fn for inode tracking */
 static int __track_inode(struct inode *inode, void *arg, bool update)
 {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1fe8370dbf0b..883e2a7cd4ab 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3873,6 +3873,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 		retval = ext4_mark_inode_dirty(handle, whiteout);
 		if (unlikely(retval))
 			goto end_rename;
+	}
 
 	if (!new.bh) {
 		retval = ext4_add_entry(handle, new.dentry, old.inode);
@@ -3946,6 +3947,8 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 			ext4_fc_track_unlink(handle, new.dentry);
 		__ext4_fc_track_link(handle, old.inode, new.dentry);
		__ext4_fc_track_unlink(handle, old.inode, old.dentry);
+		if (whiteout)
+			__ext4_fc_track_create(handle, whiteout, old.dentry);
 	}
 
 	if (new.inode) {
--
cgit v1.2.3


From 512c15ef05d73a04f1aef18a3bc61a8bb516f323 Mon Sep 17 00:00:00 2001
From: Pan Bian
Date: Sun, 17 Jan 2021 00:57:32 -0800
Subject: ext4: stop inode update before return

The inode update should be stopped before returning the error code.

Signed-off-by: Pan Bian
Link: https://lore.kernel.org/r/20210117085732.93788-1-bianpan2016@163.com
Fixes: 8016e29f4362 ("ext4: fast commit recovery path")
Cc: stable@kernel.org
Reviewed-by: Harshad Shirwadkar
Signed-off-by: Theodore Ts'o
---
 fs/ext4/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 927e47db7f00..0948a43f1b3d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5387,8 +5387,10 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 			inode->i_gid = attr->ia_gid;
 		error = ext4_mark_inode_dirty(handle, inode);
 		ext4_journal_stop(handle);
-		if (unlikely(error))
+		if (unlikely(error)) {
+			ext4_fc_stop_update(inode);
 			return error;
+		}
 	}
 
 	if (attr->ia_valid & ATTR_SIZE) {
--
cgit v1.2.3


From 64395d950bc476106b39341e42ebfd4d2eb71d2c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o
Date: Sun, 21 Mar 2021 00:45:37 -0400
Subject: ext4: initialize ret to suppress smatch warning

Signed-off-by: Theodore Ts'o
---
 fs/ext4/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 77c7c8a54da7..77c84d6f1af6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4382,7 +4382,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 {
 	struct inode *inode = file_inode(file);
 	handle_t *handle;
-	int ret, ret2 = 0, ret3 = 0;
+	int ret = 0, ret2 = 0, ret3 = 0;
 	int retries = 0;
 	int depth = 0;
 	struct ext4_map_blocks map;
--
cgit v1.2.3


From 00ddff431a458bbf143ea7c4c42d022676da1b17 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sun, 21 Mar 2021 07:06:56 -0600
Subject: io-wq: ensure task is running before processing task_work

Mark the current task as running if we need to run task_work from the
io-wq threads as part of work handling. If that is the case, then return
as such so that the caller can appropriately loop back and reset if it
was part of a going-to-sleep flush.
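
The pattern being fixed, reduced to a generic sketch (illustrative
pseudo-kernel code with a hypothetical work_pending() predicate, not the
io-wq source; the real change is in the diff below):

  /* A worker that has marked itself TASK_INTERRUPTIBLE in preparation
   * for sleep must flip back to TASK_RUNNING before doing real work
   * such as running task_work, and must then restart the idle loop
   * rather than falling through to schedule_timeout(). */
  set_current_state(TASK_INTERRUPTIBLE);
  if (work_pending()) {
  	__set_current_state(TASK_RUNNING);
  	task_work_run();
  	goto loop;		/* re-evaluate instead of sleeping */
  }
  ret = schedule_timeout(WORKER_IDLE_TIMEOUT);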
Fixes: 3bfe6106693b ("io-wq: fork worker threads from original task")
Signed-off-by: Jens Axboe
---
 fs/io-wq.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index e05f996d088f..3dc10bfd8c3b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -386,13 +386,16 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 	return NULL;
 }
 
-static void io_flush_signals(void)
+static bool io_flush_signals(void)
 {
 	if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) {
+		__set_current_state(TASK_RUNNING);
 		if (current->task_works)
 			task_work_run();
 		clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL);
+		return true;
 	}
+	return false;
 }
 
 static void io_assign_current_work(struct io_worker *worker,
@@ -499,7 +502,8 @@ loop:
 		}
 		__io_worker_idle(wqe, worker);
 		raw_spin_unlock_irq(&wqe->lock);
-		io_flush_signals();
+		if (io_flush_signals())
+			continue;
 		ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
 		if (try_to_freeze() || ret)
 			continue;
--
cgit v1.2.3


From 0031275d119efe16711cd93519b595e6f9b4b330 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher
Date: Sat, 20 Mar 2021 20:33:36 +0100
Subject: io_uring: call req_set_fail_links() on short send[msg]()/recv[msg]()
 with MSG_WAITALL

Without that it's not safe to use them in a linked combination with
others.

Now combinations like IORING_OP_SENDMSG followed by IORING_OP_SPLICE
should be possible.

We already handle short reads and writes for the following opcodes:

- IORING_OP_READV
- IORING_OP_READ_FIXED
- IORING_OP_READ
- IORING_OP_WRITEV
- IORING_OP_WRITE_FIXED
- IORING_OP_WRITE
- IORING_OP_SPLICE
- IORING_OP_TEE

Now we have it for these as well:

- IORING_OP_SENDMSG
- IORING_OP_SEND
- IORING_OP_RECVMSG
- IORING_OP_RECV

For IORING_OP_RECVMSG we also check for the MSG_TRUNC and MSG_CTRUNC
flags in order to call req_set_fail_links().

There might be applications around depending on the behavior that even
short send[msg]()/recv[msg]() returns continue an IOSQE_IO_LINK chain.
It's very unlikely that such applications pass in MSG_WAITALL, which is
only defined in 'man 2 recvmsg', but not in 'man 2 sendmsg'.

It's expected that the low-level sock_sendmsg() call just ignores
MSG_WAITALL, as MSG_ZEROCOPY is also ignored without SO_ZEROCOPY being
explicitly set.

We also expect the caller to know about the implicit truncation to
MAX_RW_COUNT, which we don't detect.
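
For reference, the MSG_WAITALL semantics being enforced can be seen
from plain userspace sockets (a self-contained sketch; error handling
trimmed):

  #include <stdio.h>
  #include <sys/socket.h>
  #include <unistd.h>

  int main(void)
  {
  	int sv[2];
  	char buf[8];
  	ssize_t n;

  	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
  	write(sv[1], "abc", 3);
  	close(sv[1]);	/* EOF: only 3 of the 8 requested bytes arrive */

  	n = recv(sv[0], buf, sizeof(buf), MSG_WAITALL);
  	/* n == 3: a short receive. The patch compares the result with
  	 * the full iov length (min_ret) and fails the link chain when
  	 * ret < min_ret. */
  	printf("recv returned %zd of %zu\n", n, sizeof(buf));
  	return 0;
  }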
cc: netdev@vger.kernel.org
Link: https://lore.kernel.org/r/c4e1a4cc0d905314f4d5dc567e65a7b09621aab3.1615908477.git.metze@samba.org
Signed-off-by: Stefan Metzmacher
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ea2d3e120555..543551d70327 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4386,6 +4386,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
 	unsigned flags;
+	int min_ret = 0;
 	int ret;
 
 	sock = sock_from_file(req->file);
@@ -4406,6 +4407,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	else if (issue_flags & IO_URING_F_NONBLOCK)
 		flags |= MSG_DONTWAIT;
 
+	if (flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+
 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
 	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
 		return io_setup_async_msg(req, kmsg);
@@ -4416,7 +4420,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 	if (kmsg->free_iov)
 		kfree(kmsg->free_iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
-	if (ret < 0)
+	if (ret < min_ret)
 		req_set_fail_links(req);
 	__io_req_complete(req, issue_flags, ret, 0);
 	return 0;
@@ -4429,6 +4433,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	struct iovec iov;
 	struct socket *sock;
 	unsigned flags;
+	int min_ret = 0;
 	int ret;
 
 	sock = sock_from_file(req->file);
@@ -4450,6 +4455,9 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	else if (issue_flags & IO_URING_F_NONBLOCK)
 		flags |= MSG_DONTWAIT;
 
+	if (flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&msg.msg_iter);
+
 	msg.msg_flags = flags;
 	ret = sock_sendmsg(sock, &msg);
 	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
@@ -4457,7 +4465,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
 
-	if (ret < 0)
+	if (ret < min_ret)
 		req_set_fail_links(req);
 	__io_req_complete(req, issue_flags, ret, 0);
 	return 0;
@@ -4609,6 +4617,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	struct socket *sock;
 	struct io_buffer *kbuf;
 	unsigned flags;
+	int min_ret = 0;
 	int ret, cflags = 0;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
@@ -4640,6 +4649,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	else if (force_nonblock)
 		flags |= MSG_DONTWAIT;
 
+	if (flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+
 	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
 					kmsg->uaddr, flags);
 	if (force_nonblock && ret == -EAGAIN)
@@ -4653,7 +4665,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	if (kmsg->free_iov)
 		kfree(kmsg->free_iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
-	if (ret < 0)
+	if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
 		req_set_fail_links(req);
 	__io_req_complete(req, issue_flags, ret, cflags);
 	return 0;
@@ -4668,6 +4680,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	struct socket *sock;
 	struct iovec iov;
 	unsigned flags;
+	int min_ret = 0;
 	int ret, cflags = 0;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
@@ -4699,6 +4712,9 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	else if (force_nonblock)
 		flags |= MSG_DONTWAIT;
 
+	if (flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&msg.msg_iter);
+
 	ret = sock_recvmsg(sock, &msg, flags);
 	if (force_nonblock && ret == -EAGAIN)
 		return -EAGAIN;
@@ -4707,7 +4723,7 @@ out_free:
 	if (req->flags & REQ_F_BUFFER_SELECTED)
 		cflags = io_put_recv_kbuf(req);
-	if (ret < 0)
+	if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
 		req_set_fail_links(req);
 	__io_req_complete(req, issue_flags, ret, cflags);
 	return 0;
--
cgit v1.2.3


From 0b8cfa974dfc964e6382c9e25fa6c1bdac6ef499 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sun, 21 Mar 2021 14:16:08 -0600
Subject: io_uring: don't use {test,clear}_tsk_thread_flag() for current

Linus correctly points out that this is both unnecessary and generates
much worse code on some archs, as going from current to thread_info is
actually backwards - and obviously just wasteful, since the thread_info
is what we care about.

Since io_uring only operates on current for these operations, just use
test_thread_flag() instead. For io-wq, we can further simplify and use
tracehook_notify_signal() to handle the TIF_NOTIFY_SIGNAL work and clear
the flag. The latter isn't an actual bug right now, but it may very well
be in the future if we place other work items under TIF_NOTIFY_SIGNAL.

Reported-by: Linus Torvalds
Link: https://lore.kernel.org/io-uring/CAHk-=wgYhNck33YHKZ14mFB5MzTTk8gqXHcfj=RWTAXKwgQJgg@mail.gmail.com/
Signed-off-by: Jens Axboe
---
 fs/io-wq.c    | 6 ++----
 fs/io_uring.c | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 3dc10bfd8c3b..2dd43bdb9c7b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -388,11 +388,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 
 static bool io_flush_signals(void)
 {
-	if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) {
+	if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) {
 		__set_current_state(TASK_RUNNING);
-		if (current->task_works)
-			task_work_run();
-		clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL);
+		tracehook_notify_signal();
 		return true;
 	}
 	return false;
 }
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 543551d70327..be04bc6b5b99 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6873,7 +6873,7 @@ static int io_run_task_work_sig(void)
 		return 1;
 	if (!signal_pending(current))
 		return 0;
-	if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
+	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
 		return -ERESTARTSYS;
 	return -EINTR;
 }
--
cgit v1.2.3


From d07f1e8a42614cc938c9c88866d4474a5a7fee31 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 22 Mar 2021 01:45:58 +0000
Subject: io_uring: correct io_queue_async_work() traces

A request's io-wq work is hashed in io_prep_async_link(), and since
trace_io_uring_queue_async_work() looks at it, the trace should be
emitted after the prep has been done.
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/709c9f872f4d2e198c7aed9c49019ca7095dd24d.1616366969.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index be04bc6b5b99..50acd6dfd966 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1239,10 +1239,10 @@ static void io_queue_async_work(struct io_kiocb *req)
 	BUG_ON(!tctx);
 	BUG_ON(!tctx->io_wq);
 
-	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
-					&req->work, req->flags);
 	/* init ->work of the whole link before punting */
 	io_prep_async_link(req);
+	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
+					&req->work, req->flags);
 	io_wq_enqueue(tctx->io_wq, &req->work);
 	if (link)
 		io_queue_linked_timeout(link);
--
cgit v1.2.3


From b65c128f963df367a8adcfb08f5ecf8721052723 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 22 Mar 2021 01:45:59 +0000
Subject: io_uring: don't skip file_end_write() on reissue

Don't forget to call kiocb_end_write() from __io_complete_rw() on
reissue. It shouldn't be much of a problem, as the function actually
does some work only for ISREG, and NONBLOCK won't be reissued.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/32af9b77c5b874e1bee1a3c46396094bd969e577.1616366969.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 50acd6dfd966..a664b44a5a94 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2524,13 +2524,12 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
 {
 	int cflags = 0;
 
+	if (req->rw.kiocb.ki_flags & IOCB_WRITE)
+		kiocb_end_write(req);
 	if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req))
 		return;
 	if (res != req->result)
 		req_set_fail_links(req);
-
-	if (req->rw.kiocb.ki_flags & IOCB_WRITE)
-		kiocb_end_write(req);
 	if (req->flags & REQ_F_BUFFER_SELECTED)
 		cflags = io_put_rw_kbuf(req);
 	__io_req_complete(req, issue_flags, res, cflags);
--
cgit v1.2.3


From d81269fecb8ce16eb07efafc9ff5520b2a31c486 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 19 Mar 2021 10:21:19 +0000
Subject: io_uring: fix provide_buffers sign extension

io_provide_buffers_prep()'s "p->len * p->nbufs" is subject to sign
extension problems. Not a huge problem, as it's only used for
access_ok() and increases the checked length, but better to keep the
typing right.
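
The arithmetic hazard, isolated into a standalone snippet (stand-in
types chosen to make the sign extension visible; note that the 32-bit
signed overflow in the unfixed expression is formally undefined
behaviour):

  #include <stdio.h>

  int main(void)
  {
  	int len = 0x7fffffff;		/* stands in for p->len   */
  	unsigned short nbufs = 2;	/* stands in for p->nbufs */

  	/* multiply in 32 bits first: wraps negative, then
  	 * sign-extends into a huge unsigned long */
  	unsigned long before = len * nbufs;
  	/* widen before multiplying, as the patch does */
  	unsigned long after = (unsigned long)len * nbufs;

  	printf("before=%#lx after=%#lx\n", before, after);
  	return 0;
  }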
Reported-by: Colin Ian King
Fixes: efe68c1ca8f49 ("io_uring: validate the full range of provided buffers for access")
Signed-off-by: Pavel Begunkov
Reviewed-by: Colin Ian King
Link: https://lore.kernel.org/r/562376a39509e260d8532186a06226e56eb1f594.1616149233.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a664b44a5a94..f3ae83a2d7bc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3977,6 +3977,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 static int io_provide_buffers_prep(struct io_kiocb *req,
 				   const struct io_uring_sqe *sqe)
 {
+	unsigned long size;
 	struct io_provide_buf *p = &req->pbuf;
 	u64 tmp;
 
@@ -3990,7 +3991,8 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
 	p->addr = READ_ONCE(sqe->addr);
 	p->len = READ_ONCE(sqe->len);
 
-	if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
+	size = (unsigned long)p->len * p->nbufs;
+	if (!access_ok(u64_to_user_ptr(p->addr), size))
 		return -EFAULT;
 
 	p->bgid = READ_ONCE(sqe->buf_group);
--
cgit v1.2.3


From 5116784039f0421e9a619023cfba3e302c3d9adc Mon Sep 17 00:00:00 2001
From: Chris Chiu
Date: Tue, 23 Mar 2021 16:52:19 +0800
Subject: block: clear GD_NEED_PART_SCAN later in bdev_disk_changed

The GD_NEED_PART_SCAN bit is set by bdev_check_media_change to initiate
a partition scan while removing a block device. It should be cleared
after blk_drop_partitions, because blk_drop_partitions could return
-EBUSY, and then the subsequent __blkdev_get has no chance to do
delete_partition if GD_NEED_PART_SCAN has already been cleared.

This causes problems on some card readers, e.g. the Realtek card readers
0bda:0328 and 0bda:0158. The device node of the partition will not
disappear after the memory card is removed, so user applications cannot
update the device mapping correctly.

BugLink: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1920874
Signed-off-by: Chris Chiu
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20210323085219.24428-1-chris.chiu@canonical.com
Signed-off-by: Jens Axboe
---
 fs/block_dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 92ed7d5df677..28d583fcdc2c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1240,13 +1240,13 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
 
 	lockdep_assert_held(&bdev->bd_mutex);
 
-	clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
-
 rescan:
 	ret = blk_drop_partitions(bdev);
 	if (ret)
 		return ret;
 
+	clear_bit(GD_NEED_PART_SCAN, &disk->state);
+
 	/*
 	 * Historically we only set the capacity to zero for devices that
 	 * support partitions (independ of actually having partitions created).
--
cgit v1.2.3


From 39f985c8f667c80a3d1eb19d31138032fa36b09e Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Sat, 20 Mar 2021 05:40:38 +0000
Subject: fs/cachefiles: Remove wait_bit_key layout dependency

Cachefiles was relying on wait_page_key and wait_bit_key being the same
layout, which is fragile. Now that wait_page_key is exposed in the
pagemap.h header, we can remove that fragility. A comment on the need to
maintain structure layout equivalence was added by Linus[1] and that is
no longer applicable.
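
The accidental layout equivalence being removed can be mocked up in
isolation (simplified field lists, assumed here for illustration only):

  #include <stddef.h>
  #include <stdio.h>

  /* The old code peeked at a wait_page_key through a wait_bit_key
   * pointer, relying on the watched word and bit_nr lining up. */
  struct wait_bit_key  { void *flags; int bit_nr; unsigned long timeout; };
  struct wait_page_key { void *page;  int bit_nr; int page_match; };

  int main(void)
  {
  	printf("word at %zu/%zu, bit_nr at %zu/%zu\n",
  	       offsetof(struct wait_bit_key, flags),
  	       offsetof(struct wait_page_key, page),
  	       offsetof(struct wait_bit_key, bit_nr),
  	       offsetof(struct wait_page_key, bit_nr));
  	return 0;
  }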
Fixes: 62906027091f ("mm: add PageWaiters indicating tasks are waiting for a page bit")
Signed-off-by: Matthew Wilcox (Oracle)
Reviewed-by: Christoph Hellwig
Signed-off-by: David Howells
Tested-by: kafs-testing@auristor.com
cc: linux-cachefs@redhat.com
cc: linux-mm@kvack.org
Link: https://lore.kernel.org/r/20210320054104.1300774-2-willy@infradead.org/
Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3510ca20ece0150af6b10c77a74ff1b5c198e3e2 [1]
---
 fs/cachefiles/rdwr.c    | 7 +++----
 include/linux/pagemap.h | 1 -
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index e027c718ca01..8ffc40e84a59 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -24,17 +24,16 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
 		container_of(wait, struct cachefiles_one_read, monitor);
 	struct cachefiles_object *object;
 	struct fscache_retrieval *op = monitor->op;
-	struct wait_bit_key *key = _key;
+	struct wait_page_key *key = _key;
 	struct page *page = wait->private;
 
 	ASSERT(key);
 
 	_enter("{%lu},%u,%d,{%p,%u}",
 	       monitor->netfs_page->index, mode, sync,
-	       key->flags, key->bit_nr);
+	       key->page, key->bit_nr);
 
-	if (key->flags != &page->flags ||
-	    key->bit_nr != PG_locked)
+	if (key->page != page || key->bit_nr != PG_locked)
 		return 0;
 
 	_debug("--- monitor %p %lx ---", page, page->flags);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 20225b067583..8f4daac6eb4b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -559,7 +559,6 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 	return pgoff;
 }
 
-/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
 struct wait_page_key {
 	struct page *page;
 	int bit_nr;
--
cgit v1.2.3


From 75b69799610c2b909a18e709c402923ea61aedc0 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Sat, 20 Mar 2021 05:40:40 +0000
Subject: afs: Use wait_on_page_writeback_killable

Open-coding this function meant it missed out on the recent bugfix for
waiters being woken by a delayed wake event from a previous
instantiation of the page[1].
[DH: Changed the patch to use vmf->page rather than variable page which
 doesn't exist yet upstream]

Fixes: 1cf7a1518aef ("afs: Implement shared-writeable mmap")
Signed-off-by: Matthew Wilcox (Oracle)
Reviewed-by: Christoph Hellwig
Signed-off-by: David Howells
Tested-by: kafs-testing@auristor.com
cc: linux-afs@lists.infradead.org
cc: linux-mm@kvack.org
Link: https://lore.kernel.org/r/20210320054104.1300774-4-willy@infradead.org
Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c2407cf7d22d0c0d94cf20342b3b8f06f1d904e7 [1]
---
 fs/afs/write.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/write.c b/fs/afs/write.c
index c9195fc67fd8..eb737ed63afb 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -851,8 +851,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
 	fscache_wait_on_page_write(vnode->cache, vmf->page);
 #endif
 
-	if (PageWriteback(vmf->page) &&
-	    wait_on_page_bit_killable(vmf->page, PG_writeback) < 0)
+	if (wait_on_page_writeback_killable(vmf->page))
 		return VM_FAULT_RETRY;
 
 	if (lock_page_killable(vmf->page) < 0)
--
cgit v1.2.3


From a185f1db59f13de73aa470559030e90e50b34d93 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 23 Mar 2021 10:52:38 +0000
Subject: io_uring: do ctx sqd ejection in a clear context

 WARNING: CPU: 1 PID: 27907 at fs/io_uring.c:7147 io_sq_thread_park+0xb5/0xd0 fs/io_uring.c:7147
 CPU: 1 PID: 27907 Comm: iou-sqp-27905 Not tainted 5.12.0-rc4-syzkaller #0
 RIP: 0010:io_sq_thread_park+0xb5/0xd0 fs/io_uring.c:7147
 Call Trace:
  io_ring_ctx_wait_and_kill+0x214/0x700 fs/io_uring.c:8619
  io_uring_release+0x3e/0x50 fs/io_uring.c:8646
  __fput+0x288/0x920 fs/file_table.c:280
  task_work_run+0xdd/0x1a0 kernel/task_work.c:140
  io_run_task_work fs/io_uring.c:2238 [inline]
  io_run_task_work fs/io_uring.c:2228 [inline]
  io_uring_try_cancel_requests+0x8ec/0xc60 fs/io_uring.c:8770
  io_uring_cancel_sqpoll+0x1cf/0x290 fs/io_uring.c:8974
  io_sqpoll_cancel_cb+0x87/0xb0 fs/io_uring.c:8907
  io_run_task_work_head+0x58/0xb0 fs/io_uring.c:1961
  io_sq_thread+0x3e2/0x18d0 fs/io_uring.c:6763
  ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294

It may happen that the last ctx reference is dropped in
io_uring_cancel_sqpoll(), so the fput callback (i.e. io_uring_release())
is enqueued through task_work and run by the same cancellation. As it's
deeply nested, we can't park or take sqd->lock there, because its state
is unclear. So avoid ejecting the ctx from the sqd list in
io_ring_ctx_wait_and_kill() and do it in a clear context in
io_ring_exit_work().
Fixes: f6d54255f423 ("io_uring: halt SQO submission on ctx exit")
Reported-by: syzbot+e3a3f84f5cecf61f0583@syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/e90df88b8ff2cabb14a7534601d35d62ab4cb8c7.1616496707.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f3ae83a2d7bc..8c5789b96dbb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8564,6 +8564,14 @@ static void io_ring_exit_work(struct work_struct *work)
 	struct io_tctx_node *node;
 	int ret;
 
+	/* prevent SQPOLL from submitting new requests */
+	if (ctx->sq_data) {
+		io_sq_thread_park(ctx->sq_data);
+		list_del_init(&ctx->sqd_list);
+		io_sqd_update_thread_idle(ctx->sq_data);
+		io_sq_thread_unpark(ctx->sq_data);
+	}
+
 	/*
 	 * If we're doing polled IO and end up having requests being
 	 * submitted async (out-of-line), then completions can come in while
@@ -8615,14 +8623,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 		io_unregister_personality(ctx, index);
 	mutex_unlock(&ctx->uring_lock);
 
-	/* prevent SQPOLL from submitting new requests */
-	if (ctx->sq_data) {
-		io_sq_thread_park(ctx->sq_data);
-		list_del_init(&ctx->sqd_list);
-		io_sqd_update_thread_idle(ctx->sq_data);
-		io_sq_thread_unpark(ctx->sq_data);
-	}
-
 	io_kill_timeouts(ctx, NULL, NULL);
 	io_poll_remove_all(ctx, NULL, NULL);
--
cgit v1.2.3


From bf1c82a5389061d989f5e07f1c958db4efaf2141 Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Wed, 24 Mar 2021 08:51:10 +0000
Subject: cachefiles: do not yet allow on idmapped mounts

Based on discussions (e.g. in [1]), my understanding of cachefiles and
the cachefiles userspace daemon is that it creates a cache on a local
filesystem (e.g. ext4, xfs, etc.) for a network filesystem. The way this
is done is by writing "bind" to /dev/cachefiles and pointing it to the
directory to use as the cache.

Currently this directory can technically also be an idmapped mount, but
cachefiles isn't yet fully aware of such mounts and thus doesn't take
the idmapping into account when creating cache entries. This could leave
users confused, as the ownership of the files wouldn't match what they
expressed in the idmapping. Block cache files on idmapped mounts until
the fscache rework is done and we have ported it to support idmapped
mounts.
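
From the daemon's side, the bind step looks roughly like this (a sketch
under assumptions: the command strings follow cachefilesd's protocol of
one plain-text command per write(), and the cache path is hypothetical):

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
  	int fd = open("/dev/cachefiles", O_RDWR);

  	if (fd < 0)
  		return perror("open"), 1;
  	dprintf(fd, "dir /var/cache/fscache");
  	/* With this patch, "bind" fails with EINVAL if the directory
  	 * above sits on an idmapped mount. */
  	dprintf(fd, "bind");
  	close(fd);
  	return 0;
  }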
Signed-off-by: Christian Brauner
Signed-off-by: David Howells
Cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/lkml/20210303161528.n3jzg66ou2wa43qb@wittgenstein [1]
Link: https://lore.kernel.org/r/20210316112257.2974212-1-christian.brauner@ubuntu.com/ # v1
Link: https://listman.redhat.com/archives/linux-cachefs/2021-March/msg00044.html # v2
Link: https://lore.kernel.org/r/20210319114146.410329-1-christian.brauner@ubuntu.com/ # v3
Signed-off-by: Linus Torvalds
---
 fs/cachefiles/bind.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index dfb14dbddf51..38bb7764b454 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -118,6 +118,12 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
 	cache->mnt = path.mnt;
 	root = path.dentry;
 
+	ret = -EINVAL;
+	if (mnt_user_ns(path.mnt) != &init_user_ns) {
+		pr_warn("File cache on idmapped mounts not supported");
+		goto error_unsupported;
+	}
+
 	/* check parameters */
 	ret = -EOPNOTSUPP;
 	if (d_is_negative(root) ||
--
cgit v1.2.3


From f5d2d23bf0d948ce0b9307b7bacae7ff0bc03c71 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 25 Mar 2021 10:16:12 -0600
Subject: io-wq: fix race around pending work on teardown

syzbot reports that it's triggering the warning condition on having
pending work on shutdown:

 WARNING: CPU: 1 PID: 12346 at fs/io-wq.c:1061 io_wq_destroy fs/io-wq.c:1061 [inline]
 WARNING: CPU: 1 PID: 12346 at fs/io-wq.c:1061 io_wq_put+0x153/0x260 fs/io-wq.c:1072
 Modules linked in:
 CPU: 1 PID: 12346 Comm: syz-executor.5 Not tainted 5.12.0-rc2-syzkaller #0
 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
 RIP: 0010:io_wq_destroy fs/io-wq.c:1061 [inline]
 RIP: 0010:io_wq_put+0x153/0x260 fs/io-wq.c:1072
 Code: 8d e8 71 90 ea 01 49 89 c4 41 83 fc 40 7d 4f e8 33 4d 97 ff 42 80 7c 2d 00 00 0f 85 77 ff ff ff e9 7a ff ff ff e8 1d 4d 97 ff <0f> 0b eb b9 8d 6b ff 89 ee 09 de bf ff ff ff ff e8 18 51 97 ff 09
 RSP: 0018:ffffc90001ebfb08 EFLAGS: 00010293
 RAX: ffffffff81e16083 RBX: ffff888019038040 RCX: ffff88801e86b780
 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000040
 RBP: 1ffff1100b2f8a80 R08: ffffffff81e15fce R09: ffffed100b2f8a82
 R10: ffffed100b2f8a82 R11: 0000000000000000 R12: 0000000000000000
 R13: dffffc0000000000 R14: ffff8880597c5400 R15: ffff888019038000
 FS:  00007f8dcd89c700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 000055e9a054e160 CR3: 000000001dfb8000 CR4: 00000000001506f0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  io_uring_clean_tctx+0x1b7/0x210 fs/io_uring.c:8802
  __io_uring_files_cancel+0x13c/0x170 fs/io_uring.c:8820
  io_uring_files_cancel include/linux/io_uring.h:47 [inline]
  do_exit+0x258/0x2340 kernel/exit.c:780
  do_group_exit+0x168/0x2d0 kernel/exit.c:922
  get_signal+0x1734/0x1ef0 kernel/signal.c:2773
  arch_do_signal_or_restart+0x3c/0x610 arch/x86/kernel/signal.c:811
  handle_signal_work kernel/entry/common.c:147 [inline]
  exit_to_user_mode_loop kernel/entry/common.c:171 [inline]
  exit_to_user_mode_prepare+0xac/0x1e0 kernel/entry/common.c:208
  __syscall_exit_to_user_mode_work kernel/entry/common.c:290 [inline]
  syscall_exit_to_user_mode+0x48/0x180 kernel/entry/common.c:301
  entry_SYSCALL_64_after_hwframe+0x44/0xae
 RIP: 0033:0x465f69

which shouldn't happen, but seems to be possible due to a race on
whether or not the io-wq manager sees a
fatal signal first, or whether the io-wq workers do. If we race with
queueing work and then send a fatal signal to the owning task, and the
io-wq worker sees that before the manager sets IO_WQ_BIT_EXIT, then it's
possible to have the worker exit and leave work behind.

Just turn the WARN_ON_ONCE() into a cancelation condition instead.

Reported-by: syzbot+77a738a6bc947bf639ca@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe
---
 fs/io-wq.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 2dd43bdb9c7b..b7c1fa932cb3 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -1063,7 +1063,11 @@ static void io_wq_destroy(struct io_wq *wq)
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
-		WARN_ON_ONCE(!wq_list_empty(&wqe->work_list));
+		struct io_cb_cancel_data match = {
+			.fn		= io_wq_work_match_all,
+			.cancel_all	= true,
+		};
+		io_wqe_cancel_pending_work(wqe, &match);
 		kfree(wqe);
 	}
 	io_wq_put_hash(wq->hash);
--
cgit v1.2.3


From c1b2028315c6b15e8d6725e0d5884b15887d3daa Mon Sep 17 00:00:00 2001
From: Sean Nyekjaer
Date: Wed, 24 Mar 2021 21:37:32 -0700
Subject: squashfs: fix inode lookup sanity checks

When mounting a squashfs image created without inode compression, it
fails with: "unable to read inode lookup table"

It turns out that the BLOCK_OFFSET is missing when checking
SQUASHFS_METADATA_SIZE against the actual size.

Link: https://lkml.kernel.org/r/20210226092903.1473545-1-sean@geanix.com
Fixes: eabac19e40c0 ("squashfs: add more sanity checks in inode lookup")
Signed-off-by: Sean Nyekjaer
Acked-by: Phillip Lougher
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/squashfs/export.c      | 8 ++++++--
 fs/squashfs/squashfs_fs.h | 1 +
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index eb02072d28dd..723763746238 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -152,14 +152,18 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
 		start = le64_to_cpu(table[n]);
 		end = le64_to_cpu(table[n + 1]);
 
-		if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+		if (start >= end
+		    || (end - start) >
+		    (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
 			kfree(table);
 			return ERR_PTR(-EINVAL);
 		}
 	}
 
 	start = le64_to_cpu(table[indexes - 1]);
-	if (start >= lookup_table_start || (lookup_table_start - start) > SQUASHFS_METADATA_SIZE) {
+	if (start >= lookup_table_start ||
+	    (lookup_table_start - start) >
+	    (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
 		kfree(table);
 		return ERR_PTR(-EINVAL);
 	}
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 8d64edb80ebf..b3fdc8212c5f 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -17,6 +17,7 @@
 
 /* size of metadata (inode and directory) blocks */
 #define SQUASHFS_METADATA_SIZE		8192
+#define SQUASHFS_BLOCK_OFFSET		2
 
 /* default size of block device I/O */
 #ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE
--
cgit v1.2.3


From 8b44ca2b634527151af07447a8090a5f3a043321 Mon Sep 17 00:00:00 2001
From: Phillip Lougher
Date: Wed, 24 Mar 2021 21:37:35 -0700
Subject: squashfs: fix xattr id and id lookup sanity checks

The checks for the maximum metadata block size are missing
SQUASHFS_BLOCK_OFFSET (the two-byte length count).
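
The corrected invariant, extracted into a standalone helper (constants
per the diffs; illustrative only):

  #include <stdbool.h>

  #define SQUASHFS_METADATA_SIZE	8192	/* max data bytes per metadata block */
  #define SQUASHFS_BLOCK_OFFSET	2	/* the two-byte length header */

  /* Two consecutive table pointers may legitimately be up to a full
   * metadata block plus its length header apart. */
  static bool table_gap_valid(unsigned long long start,
  			    unsigned long long end)
  {
  	return start < end &&
  	       (end - start) <= SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET;
  }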
Link: https://lkml.kernel.org/r/2069685113.2081245.1614583677427@webmail.123-reg.co.uk
Fixes: f37aa4c7366e23f ("squashfs: add more sanity checks in id lookup")
Signed-off-by: Phillip Lougher
Cc: Sean Nyekjaer
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/squashfs/id.c       | 6 ++++--
 fs/squashfs/xattr_id.c | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 11581bf31af4..ea5387679723 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -97,14 +97,16 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
 		start = le64_to_cpu(table[n]);
 		end = le64_to_cpu(table[n + 1]);
 
-		if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+		if (start >= end || (end - start) >
+				(SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
 			kfree(table);
 			return ERR_PTR(-EINVAL);
 		}
 	}
 
 	start = le64_to_cpu(table[indexes - 1]);
-	if (start >= id_table_start || (id_table_start - start) > SQUASHFS_METADATA_SIZE) {
+	if (start >= id_table_start || (id_table_start - start) >
+				(SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
 		kfree(table);
 		return ERR_PTR(-EINVAL);
 	}
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index ead66670b41a..087cab8c78f4 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -109,14 +109,16 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
 		start = le64_to_cpu(table[n]);
 		end = le64_to_cpu(table[n + 1]);
 
-		if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+		if (start >= end || (end - start) >
+				(SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
 			kfree(table);
 			return ERR_PTR(-EINVAL);
 		}
 	}
 
 	start = le64_to_cpu(table[indexes - 1]);
-	if (start >= table_start || (table_start - start) > SQUASHFS_METADATA_SIZE) {
+	if (start >= table_start || (table_start - start) >
+				(SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
 		kfree(table);
 		return ERR_PTR(-EINVAL);
 	}
--
cgit v1.2.3


From ff132c5f93c06bd4432bbab5c369e468653bdec4 Mon Sep 17 00:00:00 2001
From: Bob Peterson
Date: Thu, 25 Mar 2021 08:51:13 -0400
Subject: gfs2: report "already frozen/thawed" errors

Before this patch, gfs2's freeze function failed to report an error when
the target file system was already frozen, as it should have (and as the
generic vfs function freeze_super does). Similarly, gfs2's thaw function
failed to report an error when trying to thaw a file system that is not
frozen, as the vfs function thaw_super does. The errors were detected,
but the functions always returned a 0 return code.

This patch adds the missing error return codes to gfs2 freeze and thaw.
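
From userspace the corrected behaviour is visible through the freeze
ioctls (a sketch assuming root privileges and a gfs2 filesystem mounted
at a hypothetical path):

  #include <fcntl.h>
  #include <linux/fs.h>
  #include <stdio.h>
  #include <sys/ioctl.h>

  int main(void)
  {
  	int fd = open("/mnt/gfs2", O_RDONLY);

  	if (fd < 0)
  		return perror("open"), 1;
  	ioctl(fd, FIFREEZE, 0);
  	if (ioctl(fd, FIFREEZE, 0))
  		perror("second FIFREEZE");	/* now fails with EBUSY */
  	ioctl(fd, FITHAW, 0);
  	if (ioctl(fd, FITHAW, 0))
  		perror("second FITHAW");	/* now fails with EINVAL */
  	return 0;
  }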
Signed-off-by: Bob Peterson
Signed-off-by: Andreas Gruenbacher
---
 fs/gfs2/super.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 9e91c9d92bd6..8fb9602d79b4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -752,11 +752,13 @@ void gfs2_freeze_func(struct work_struct *work)
 static int gfs2_freeze(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
-	int error = 0;
+	int error;
 
 	mutex_lock(&sdp->sd_freeze_mutex);
-	if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN)
+	if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) {
+		error = -EBUSY;
 		goto out;
+	}
 
 	for (;;) {
 		if (gfs2_withdrawn(sdp)) {
@@ -797,10 +799,10 @@ static int gfs2_unfreeze(struct super_block *sb)
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 
 	mutex_lock(&sdp->sd_freeze_mutex);
-	if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN ||
+	if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN ||
 	    !gfs2_holder_initialized(&sdp->sd_freeze_gh)) {
 		mutex_unlock(&sdp->sd_freeze_mutex);
-		return 0;
+		return -EINVAL;
 	}
 
 	gfs2_freeze_unlock(&sdp->sd_freeze_gh);
--
cgit v1.2.3


From 90b8749022bbdd0c94a13182a78f4903b98fd0d7 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 25 Mar 2021 19:05:14 +0000
Subject: io_uring: maintain CQE order of a failed link

Arguably, we want the CQEs of linked requests to be in strict order of
submission, as it always was. Now, if init of a request fails, its CQE
may be posted before all prior linked requests, including the head of
the link. Fix it by failing it last.

Fixes: de59bc104c24f ("io_uring: fail links more in io_submit_sqe()")
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/b7a96b05832e7ab23ad55f84092a2548c4a888b0.1616699075.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8c5789b96dbb..54ea561db4a5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6480,8 +6480,6 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	ret = io_init_req(ctx, req, sqe);
 	if (unlikely(ret)) {
 fail_req:
-		io_put_req(req);
-		io_req_complete(req, ret);
 		if (link->head) {
 			/* fail even hard links since we don't submit */
 			link->head->flags |= REQ_F_FAIL_LINK;
@@ -6489,6 +6487,8 @@ fail_req:
 			io_req_complete(link->head, -ECANCELED);
 			link->head = NULL;
 		}
+		io_put_req(req);
+		io_req_complete(req, ret);
 		return ret;
 	}
 	ret = io_req_prep(req, sqe);
--
cgit v1.2.3


From 7f6c411c9b50cfab41cc798e003eff27608c7016 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 25 Mar 2021 14:12:34 -0400
Subject: hostfs: fix memory handling in follow_link()

1) the argument should not be freed in any case - the caller already has
it as ->s_fs_info (and uses it a lot afterwards)

2) allocate the readlink buffer with kmalloc() - the caller has no way
to tell if it's got that (on an absolute symlink) or a result of
kasprintf(). Sure, for SLAB and SLUB kfree() works on results of
kmem_cache_alloc(), but that's not documented anywhere, might change in
the future *and* is already not true for SLOB.
Fixes: 52b209f7b848 ("get rid of hostfs_read_inode()")
Signed-off-by: Al Viro
---
 fs/hostfs/hostfs_kern.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 29e407762626..743a005a5c64 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -144,7 +144,7 @@ static char *follow_link(char *link)
 	char *name, *resolved, *end;
 	int n;
 
-	name = __getname();
+	name = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (!name) {
 		n = -ENOMEM;
 		goto out_free;
@@ -173,12 +173,11 @@ static char *follow_link(char *link)
 		goto out_free;
 	}
 
-	__putname(name);
-	kfree(link);
+	kfree(name);
 	return resolved;
 
 out_free:
-	__putname(name);
+	kfree(name);
 	return ERR_PTR(n);
 }
 
--
cgit v1.2.3


From 45a4546c6167a2da348a31ca439d8a8ff773b6ea Mon Sep 17 00:00:00 2001
From: Shyam Prasad N
Date: Thu, 25 Mar 2021 12:34:54 +0000
Subject: cifs: Adjust key sizes and key generation routines for AES256
 encryption

For AES256 encryption (GCM and CCM), we need to adjust the size of a few
fields to 32 bytes instead of 16 to accommodate the larger keys. Also,
the L value supplied to the key generator needs to be changed from 128
to 256 when these algorithms are used.

Keeping the ioctl struct for dumping keys the same size for now. Will
send out a different patch for that one.

Signed-off-by: Shyam Prasad N
Reviewed-by: Ronnie Sahlberg
CC:  # v5.10+
Signed-off-by: Steve French
---
 fs/cifs/cifsglob.h      |  4 ++--
 fs/cifs/cifspdu.h       |  5 +++++
 fs/cifs/smb2glob.h      |  1 +
 fs/cifs/smb2ops.c       |  9 +++++----
 fs/cifs/smb2transport.c | 37 ++++++++++++++++++++++---------
 5 files changed, 41 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 31fc8695abd6..67c056a9a519 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -919,8 +919,8 @@ struct cifs_ses {
 	bool binding:1; /* are we binding the session? */
 	__u16 session_flags;
 	__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
-	__u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
-	__u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
+	__u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE];
+	__u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE];
 	__u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
 	__u8 binding_preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
 
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 64fe5a47b5e8..9adc74bd9f8f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -147,6 +147,11 @@
  */
 #define SMB3_SIGN_KEY_SIZE (16)
 
+/*
+ * Size of the smb3 encryption/decryption keys
+ */
+#define SMB3_ENC_DEC_KEY_SIZE (32)
+
 #define CIFS_CLIENT_CHALLENGE_SIZE (8)
 #define CIFS_SERVER_CHALLENGE_SIZE (8)
 #define CIFS_HMAC_MD5_HASH_SIZE (16)
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 99a1951a01ec..d9a990c99121 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -58,6 +58,7 @@
 #define SMB2_HMACSHA256_SIZE (32)
 #define SMB2_CMACAES_SIZE (16)
 #define SMB3_SIGNKEY_SIZE (16)
+#define SMB3_GCM128_CRYPTKEY_SIZE (16)
 #define SMB3_GCM256_CRYPTKEY_SIZE (32)
 
 /* Maximum buffer size value we can send with 1 credit */
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 9bae7e8deb09..3e94f83897e9 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -4158,7 +4158,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
 		if (ses->Suid == ses_id) {
 			ses_enc_key = enc ?
				ses->smb3encryptionkey : ses->smb3decryptionkey;
-			memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE);
+			memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
 			spin_unlock(&cifs_tcp_ses_lock);
 			return 0;
 		}
@@ -4185,7 +4185,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
 	int rc = 0;
 	struct scatterlist *sg;
 	u8 sign[SMB2_SIGNATURE_SIZE] = {};
-	u8 key[SMB3_SIGN_KEY_SIZE];
+	u8 key[SMB3_ENC_DEC_KEY_SIZE];
 	struct aead_request *req;
 	char *iv;
 	unsigned int iv_len;
@@ -4209,10 +4209,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
 	tfm = enc ? server->secmech.ccmaesencrypt :
 						server->secmech.ccmaesdecrypt;
 
-	if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
+	if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
+		(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
 		rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE);
 	else
-		rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE);
+		rc = crypto_aead_setkey(tfm, key, SMB3_GCM128_CRYPTKEY_SIZE);
 
 	if (rc) {
 		cifs_server_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index ebccd71cc60a..e6fa76ab70be 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -298,7 +298,8 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,
 {
 	unsigned char zero = 0x0;
 	__u8 i[4] = {0, 0, 0, 1};
-	__u8 L[4] = {0, 0, 0, 128};
+	__u8 L128[4] = {0, 0, 0, 128};
+	__u8 L256[4] = {0, 0, 1, 0};
 	int rc = 0;
 	unsigned char prfhash[SMB2_HMACSHA256_SIZE];
 	unsigned char *hashptr = prfhash;
@@ -354,8 +355,14 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
-				L, 4);
+	if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
+		(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) {
+		rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+				L256, 4);
+	} else {
+		rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+				L128, 4);
+	}
 	if (rc) {
 		cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__);
 		goto smb3signkey_ret;
 	}
@@ -390,6 +397,9 @@ generate_smb3signingkey(struct cifs_ses *ses,
 			const struct derivation_triplet *ptriplet)
 {
 	int rc;
+#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
+	struct TCP_Server_Info *server = ses->server;
+#endif
 
 	/*
 	 * All channels use the same encryption/decryption keys but
@@ -422,11 +432,11 @@ generate_smb3signingkey(struct cifs_ses *ses,
 		rc = generate_key(ses, ptriplet->encryption.label,
 				  ptriplet->encryption.context,
 				  ses->smb3encryptionkey,
-				  SMB3_SIGN_KEY_SIZE);
+				  SMB3_ENC_DEC_KEY_SIZE);
 		rc = generate_key(ses, ptriplet->decryption.label,
 				  ptriplet->decryption.context,
 				  ses->smb3decryptionkey,
-				  SMB3_SIGN_KEY_SIZE);
+				  SMB3_ENC_DEC_KEY_SIZE);
 		if (rc)
 			return rc;
 	}
@@ -442,14 +452,23 @@ generate_smb3signingkey(struct cifs_ses *ses,
 	 */
 	cifs_dbg(VFS, "Session Id %*ph\n", (int)sizeof(ses->Suid), &ses->Suid);
+	cifs_dbg(VFS, "Cipher type %d\n", server->cipher_type);
 	cifs_dbg(VFS, "Session Key %*ph\n",
 		 SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response);
 	cifs_dbg(VFS, "Signing Key %*ph\n",
 		 SMB3_SIGN_KEY_SIZE, ses->smb3signingkey);
-	cifs_dbg(VFS, "ServerIn Key %*ph\n",
-		 SMB3_SIGN_KEY_SIZE, ses->smb3encryptionkey);
-	cifs_dbg(VFS, "ServerOut Key %*ph\n",
-		 SMB3_SIGN_KEY_SIZE, ses->smb3decryptionkey);
+	if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
+		(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) {
+		cifs_dbg(VFS, "ServerIn Key %*ph\n",
+			 SMB3_GCM256_CRYPTKEY_SIZE, ses->smb3encryptionkey);
cifs_dbg(VFS, "ServerOut Key %*ph\n", + SMB3_GCM256_CRYPTKEY_SIZE, ses->smb3decryptionkey); + } else { + cifs_dbg(VFS, "ServerIn Key %*ph\n", + SMB3_GCM128_CRYPTKEY_SIZE, ses->smb3encryptionkey); + cifs_dbg(VFS, "ServerOut Key %*ph\n", + SMB3_GCM128_CRYPTKEY_SIZE, ses->smb3decryptionkey); + } #endif return rc; } -- cgit v1.2.3 From 3bffbe9e0b2721bb62d226a4d4211bddae52b00a Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Fri, 26 Mar 2021 10:28:16 +0000 Subject: cifs: Fix chmod with modefromsid when an older ACE already exists. My recent fixes to cifsacl to maintain inherited ACEs had regressed modefromsid when an older ACL already exists. Found testing xfstest 495 with modefromsid mount option Fixes: f5065508897a ("cifs: Retain old ACEs when converting between mode bits and ACL") Signed-off-by: Shyam Prasad N Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 2be22a5c690f..d178cf85e926 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1130,8 +1130,7 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, } /* If it's any one of the ACE we're replacing, skip! */ - if (!mode_from_sid && - ((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) || + if (((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) || (compare_sids(&pntace->sid, pownersid) == 0) || (compare_sids(&pntace->sid, pgrpsid) == 0) || (compare_sids(&pntace->sid, &sid_everyone) == 0) || -- cgit v1.2.3 From cee8f4f6fcabfdf229542926128e9874d19016d5 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 25 Mar 2021 16:26:35 +1000 Subject: cifs: revalidate mapping when we open files for SMB1 POSIX RHBZ: 1933527 Under SMB1 + POSIX, if an inode is reused on a server after we have read and cached a part of a file, when we then open the new file with the re-cycled inode there is a chance that we may serve the old data out of cache to the application. This only happens for SMB1 (deprecated) and when posix are used. The simplest solution to avoid this race is to force a revalidate on smb1-posix open. Signed-off-by: Ronnie Sahlberg Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 26de4329d161..042e24aad410 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -165,6 +165,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, goto posix_open_ret; } } else { + cifs_revalidate_mapping(*pinode); cifs_fattr_to_inode(*pinode, &fattr); } -- cgit v1.2.3 From 219481a8f90ec3a5eed9638fb35609e4b1aeece7 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Fri, 19 Mar 2021 14:57:11 +0100 Subject: cifs: Silently ignore unknown oplock break handle Make SMB2 not print out an error when an oplock break is received for an unknown handle, similar to SMB1. The debug message which is printed for these unknown handles may also be misleading, so fix that too. The SMB2 lease break path is not affected by this patch. Without this, a program which writes to a file from one thread, and opens, reads, and writes the same file from another thread triggers the below errors several times a minute when run against a Samba server configured with "smb2 leases = no". CIFS: VFS: \\192.168.0.1 No task to wake, unknown frame received! NumMids 2 00000000: 424d53fe 00000040 00000000 00000012 .SMB@........... 
 00000010: 00000001 00000000 ffffffff ffffffff ................
 00000020: 00000000 00000000 00000000 00000000 ................
 00000030: 00000000 00000000 00000000 00000000 ................

Signed-off-by: Vincent Whitchurch
Reviewed-by: Tom Talpey
Reviewed-by: Paulo Alcantara (SUSE)
Signed-off-by: Steve French
---
 fs/cifs/smb2misc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index b50164e2c88d..aac384f69f74 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -754,8 +754,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 		}
 	}
 	spin_unlock(&cifs_tcp_ses_lock);
-	cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
-	return false;
+	cifs_dbg(FYI, "No file id matched, oplock break ignored\n");
+	return true;
 }
 
 void
--
cgit v1.2.3


From cfc63fc8126a93cbf95379bc4cad79a7b15b6ece Mon Sep 17 00:00:00 2001
From: Steve French
Date: Fri, 26 Mar 2021 18:41:55 -0500
Subject: smb3: fix cached file size problems in duplicate extents (reflink)

There were two problems (one of which could cause data corruption) that
were noticed with duplicate extents (i.e. reflink) when debugging why
various xfstests were being incorrectly skipped (e.g. generic/138,
generic/140, generic/142).

First, we were not updating the file size locally in the cache when
extending a file due to reflink (it would refresh after actimeo
expires), but the xfstest checked the size immediately; it was still 0,
which caused the test to be skipped.

Second, we were setting the target file size (which could shrink the
file) in all cases to the end of the reflinked range, rather than
setting the target file size only when reflink would extend the file.

CC:
Signed-off-by: Steve French
---
 fs/cifs/smb2ops.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 3e94f83897e9..f703204fb185 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -2038,6 +2038,7 @@ smb2_duplicate_extents(const unsigned int xid,
 {
 	int rc;
 	unsigned int ret_data_len;
+	struct inode *inode;
 	struct duplicate_extents_to_file dup_ext_buf;
 	struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink);
 
@@ -2054,10 +2055,21 @@ smb2_duplicate_extents(const unsigned int xid,
 	cifs_dbg(FYI, "Duplicate extents: src off %lld dst off %lld len %lld\n",
 		src_off, dest_off, len);
 
-	rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false);
-	if (rc)
-		goto duplicate_extents_out;
+	inode = d_inode(trgtfile->dentry);
+	if (inode->i_size < dest_off + len) {
+		rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false);
+		if (rc)
+			goto duplicate_extents_out;
+
+		/*
+		 * Although also could set plausible allocation size (i_blocks)
+		 * here in addition to setting the file size, in reflink
+		 * it is likely that the target file is sparse. Its allocation
Its allocation + * size will be queried on next revalidate, but it is important + * to make sure that file's cached size is updated immediately + */ + cifs_setsize(inode, dest_off + len); + } rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_DUPLICATE_EXTENTS_TO_FILE, -- cgit v1.2.3 From dbe1bdbb39db7dfe80a903f0d267f62cf3f093d2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Mar 2021 18:16:06 -0600 Subject: io_uring: handle signals for IO threads like a normal thread We go through various hoops to disallow signals for the IO threads, but there's really no reason why we cannot just allow them. The IO threads never return to userspace like a normal thread, and hence don't go through normal signal processing. Instead, just check for a pending signal as part of the work loop, and call get_signal() to handle it for us if anything is pending. With that, we can support receiving signals, including special ones like SIGSTOP. Acked-by: "Eric W. Biederman" Signed-off-by: Jens Axboe --- fs/io-wq.c | 20 ++++++++++++++------ fs/io_uring.c | 9 ++++++--- 2 files changed, 20 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index b7c1fa932cb3..7434eb40ca8c 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "../kernel/sched/sched.h" #include "io-wq.h" @@ -503,10 +502,15 @@ loop: if (io_flush_signals()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); - if (try_to_freeze() || ret) - continue; - if (fatal_signal_pending(current)) + if (signal_pending(current)) { + struct ksignal ksig; + + if (!get_signal(&ksig)) + continue; break; + } + if (ret) + continue; /* timed out, exit unless we're the fixed worker */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || !(worker->flags & IO_WORKER_F_FIXED)) @@ -714,9 +718,13 @@ static int io_wq_manager(void *data) set_current_state(TASK_INTERRUPTIBLE); io_wq_check_workers(wq); schedule_timeout(HZ); - try_to_freeze(); - if (fatal_signal_pending(current)) + if (signal_pending(current)) { + struct ksignal ksig; + + if (!get_signal(&ksig)) + continue; set_bit(IO_WQ_BIT_EXIT, &wq->state); + } } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)); io_wq_check_workers(wq); diff --git a/fs/io_uring.c b/fs/io_uring.c index 54ea561db4a5..2d43f7b87083 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -78,7 +78,6 @@ #include #include #include -#include #define CREATE_TRACE_POINTS #include @@ -6765,8 +6764,13 @@ static int io_sq_thread(void *data) timeout = jiffies + sqd->sq_thread_idle; continue; } - if (fatal_signal_pending(current)) + if (signal_pending(current)) { + struct ksignal ksig; + + if (!get_signal(&ksig)) + continue; break; + } sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { @@ -6809,7 +6813,6 @@ static int io_sq_thread(void *data) mutex_unlock(&sqd->lock); schedule(); - try_to_freeze(); mutex_lock(&sqd->lock); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); -- cgit v1.2.3 From 1ee4160c73b2102a52bc97a4128a89c34821414f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 18:32:42 +0000 Subject: io_uring: fix timeout cancel return code When we cancel a timeout we should emit a sensible return code, like -ECANCELED but not 0, otherwise it may trick users. 
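A hedged illustration of what the return code means to a liburing-style consumer (not part of the patch; the ring setup and the timeout submission are assumed):

	struct io_uring_cqe *cqe;

	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		if (cqe->res == -ETIME)
			;	/* the timer expired normally */
		else if (cqe->res == 0)
			;	/* enough completions arrived first */
		else if (cqe->res == -ECANCELED)
			;	/* cancelled, e.g. on ring teardown */
		io_uring_cqe_seen(&ring, cqe);
	}

Before this fix a cancelled timeout also posted 0, colliding with the "enough completions arrived" case.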
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7b0ad1065e3bd1994722702bd0ba9e7bc9b0683b.1616696997.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 2d43f7b87083..229ab9bfb45b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1247,7 +1247,7 @@ static void io_queue_async_work(struct io_kiocb *req) io_queue_linked_timeout(link); } -static void io_kill_timeout(struct io_kiocb *req) +static void io_kill_timeout(struct io_kiocb *req, int status) { struct io_timeout_data *io = req->async_data; int ret; @@ -1257,7 +1257,7 @@ static void io_kill_timeout(struct io_kiocb *req) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - io_cqring_fill_event(req, 0); + io_cqring_fill_event(req, status); io_put_req_deferred(req, 1); } } @@ -1274,7 +1274,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, spin_lock_irq(&ctx->completion_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { if (io_match_task(req, tsk, files)) { - io_kill_timeout(req); + io_kill_timeout(req, -ECANCELED); canceled++; } } @@ -1326,7 +1326,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) break; list_del_init(&req->timeout.list); - io_kill_timeout(req); + io_kill_timeout(req, 0); } while (!list_empty(&ctx->timeout_list)); ctx->cq_last_tm_flush = seq; -- cgit v1.2.3 From 80c4cbdb5ee604712e59fe304d7bf084b562f705 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 18:32:43 +0000 Subject: io_uring: do post-completion chore on t-out cancel Don't forget about io_commit_cqring() + io_cqring_ev_posted() after exit/exec cancelling timeouts. Both functions declared only after io_kill_timeouts(), so to avoid tons of forward declarations move it down. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/72ace588772c0f14834a6a4185d56c445a366fb4.1616696997.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 229ab9bfb45b..8498f74595f3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1262,26 +1262,6 @@ static void io_kill_timeout(struct io_kiocb *req, int status) } } -/* - * Returns true if we found and killed one or more timeouts - */ -static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, - struct files_struct *files) -{ - struct io_kiocb *req, *tmp; - int canceled = 0; - - spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { - if (io_match_task(req, tsk, files)) { - io_kill_timeout(req, -ECANCELED); - canceled++; - } - } - spin_unlock_irq(&ctx->completion_lock); - return canceled != 0; -} - static void __io_queue_deferred(struct io_ring_ctx *ctx) { do { @@ -8611,6 +8591,28 @@ static void io_ring_exit_work(struct work_struct *work) io_ring_ctx_free(ctx); } +/* Returns true if we found and killed one or more timeouts */ +static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, + struct files_struct *files) +{ + struct io_kiocb *req, *tmp; + int canceled = 0; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { + if (io_match_task(req, tsk, files)) { + io_kill_timeout(req, -ECANCELED); + canceled++; + } + } + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + if (canceled != 0) + io_cqring_ev_posted(ctx); + return canceled != 0; +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; -- cgit v1.2.3 From 2482b58ffbdc80cfaae969ad19cb32803056505b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 18:32:44 +0000 Subject: io_uring: don't cancel-track common timeouts Don't account usual timeouts (i.e. not linked) as REQ_F_INFLIGHT but keep behaviour prior to dd59a3d595cc1 ("io_uring: reliably cancel linked timeouts"). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/104441ef5d97e3932113d44501fda0df88656b83.1616696997.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8498f74595f3..87d198d223a2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5563,7 +5563,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); - io_req_track_inflight(req); + if (is_timeout_link) + io_req_track_inflight(req); return 0; } -- cgit v1.2.3 From 78d9d7c2a331fb7a68a86e53ef7e12966459e0c5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 18:32:45 +0000 Subject: io_uring: don't cancel extra on files match As tasks always wait and kill their io-wq on exec/exit, files are of no more concern to us, so we don't need to specifically cancel them by hand in those cases. Moreover we should not, because io_match_task() looks at req->task->files now, which is always true and so leads to extra cancellations, that wasn't a case before per-task io-wq. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0566c1de9b9dd417f5de345c817ca953580e0e2e.1616696997.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 87d198d223a2..880abd8b6d31 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1094,8 +1094,6 @@ static bool io_match_task(struct io_kiocb *head, io_for_each_link(req, head) { if (req->flags & REQ_F_INFLIGHT) return true; - if (req->task->files == files) - return true; } return false; } -- cgit v1.2.3 From 2b8ed1c94182dbbd0163d0eb443a934cbf6b0d85 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 26 Mar 2021 19:52:51 +0000 Subject: io_uring: remove unused assignment to pointer io There is an assignment to io that is never read after the assignment; it is redundant and can be removed. Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 880abd8b6d31..1949b80677e7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4798,7 +4798,6 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) ret = -ENOMEM; goto out; } - io = req->async_data; memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } -- cgit v1.2.3 From 5a978dcfc0f054e4f6983a0a26355a65e34708cb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Mar 2021 09:59:30 +0000 Subject: io_uring: always go for cancellation spin on exec Always try to do cancellation in __io_uring_task_cancel() at least once, so it actually goes and cleans its sqpoll tasks (i.e. via io_sqpoll_cancel_sync()), otherwise the sqpoll task may submit new requests after cancellation and it's racy for many reasons. Fixes: 521d6a737a31c ("io_uring: cancel sqpoll via task_work") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0a21bd6d794bb1629bc906dd57a57b2c2985a8ac.1616839147.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1949b80677e7..a4a944da95a0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9002,6 +9002,8 @@ void __io_uring_task_cancel(void) /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); + __io_uring_files_cancel(NULL); + do { /* read completions before cancelations */ inflight = tctx_inflight(tctx); -- cgit v1.2.3 From 51520426f4bc3e61cbbf7a39ccf4e411b665002d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Mar 2021 11:39:29 +0100 Subject: io_uring: handle setup-failed ctx in kill_timeouts general protection fault, probably for non-canonical address 0xdffffc0000000018: 0000 [#1] KASAN: null-ptr-deref in range [0x00000000000000c0-0x00000000000000c7] RIP: 0010:io_commit_cqring+0x37f/0xc10 fs/io_uring.c:1318 Call Trace: io_kill_timeouts+0x2b5/0x320 fs/io_uring.c:8606 io_ring_ctx_wait_and_kill+0x1da/0x400 fs/io_uring.c:8629 io_uring_create fs/io_uring.c:9572 [inline] io_uring_setup+0x10da/0x2ae0 fs/io_uring.c:9599 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xae It can get into wait_and_kill() before setting up ctx->rings, and hence io_commit_cqring() fails. Mimic poll cancel and do it only when we have completed events; there can't be any requests if setup failed before initialising the rings.
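The resulting shape of io_kill_timeouts(), condensed from the diff below (illustrative sketch, not verbatim):

	if (canceled != 0)
		io_commit_cqring(ctx);	/* completed events imply ctx->rings exists */
	spin_unlock_irq(&ctx->completion_lock);
	if (canceled != 0)
		io_cqring_ev_posted(ctx);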
Fixes: 80c4cbdb5ee60 ("io_uring: do post-completion chore on t-out cancel") Reported-by: syzbot+0e905eb8228070c457a0@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/660261a48f0e7abf260c8e43c87edab3c16736fa.1617014345.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index a4a944da95a0..088a9d3c420a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8603,9 +8603,9 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, canceled++; } } - io_commit_cqring(ctx); + if (canceled != 0) + io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); - if (canceled != 0) io_cqring_ev_posted(ctx); return canceled != 0; -- cgit v1.2.3 From 82734c5b1b24c020d701cf90ccb075e43a5ccb07 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 29 Mar 2021 06:52:44 -0600 Subject: io_uring: drop sqd lock before handling signals for SQPOLL Don't call into get_signal() with the sqd mutex held, it'll fail if we're freezing the task and we'll get complaints on locks still being held: ==================================== WARNING: iou-sqp-8386/8387 still has locks held! 5.12.0-rc4-syzkaller #0 Not tainted ------------------------------------ 1 lock held by iou-sqp-8386/8387: #0: ffff88801e1d2470 (&sqd->lock){+.+.}-{3:3}, at: io_sq_thread+0x24c/0x13a0 fs/io_uring.c:6731 stack backtrace: CPU: 1 PID: 8387 Comm: iou-sqp-8386 Not tainted 5.12.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x141/0x1d7 lib/dump_stack.c:120 try_to_freeze include/linux/freezer.h:66 [inline] get_signal+0x171a/0x2150 kernel/signal.c:2576 io_sq_thread+0x8d2/0x13a0 fs/io_uring.c:6748 Fold the get_signal() case in with the parking checks, as we need to drop the lock in both cases, and since we need to be checking for parking when juggling the lock anyway. 
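The locking rule, reduced to a sketch (the real loop below also handles parking and task_work):

	mutex_unlock(&sqd->lock);	/* get_signal() may freeze, drop the lock first */
	if (signal_pending(current)) {
		struct ksignal ksig;

		did_sig = get_signal(&ksig);
	}
	mutex_lock(&sqd->lock);
	if (did_sig)
		break;			/* leave the sqpoll loop and exit */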
Reported-by: syzbot+796d767eb376810256f5@syzkaller.appspotmail.com Fixes: dbe1bdbb39db ("io_uring: handle signals for IO threads like a normal thread") Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 088a9d3c420a..6d7a1b69712b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6733,22 +6733,25 @@ static int io_sq_thread(void *data) int ret; bool cap_entries, sqt_spin, needs_sched; - if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { + if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || + signal_pending(current)) { + bool did_sig = false; + mutex_unlock(&sqd->lock); + if (signal_pending(current)) { + struct ksignal ksig; + + did_sig = get_signal(&ksig); + } cond_resched(); mutex_lock(&sqd->lock); + if (did_sig) + break; io_run_task_work(); io_run_task_work_head(&sqd->park_task_work); timeout = jiffies + sqd->sq_thread_idle; continue; } - if (signal_pending(current)) { - struct ksignal ksig; - - if (!get_signal(&ksig)) - continue; - break; - } sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { -- cgit v1.2.3 From 5e46d1b78a03d52306f21f77a4e4a144b6d31486 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 21 Mar 2021 23:37:49 +0900 Subject: reiserfs: update reiserfs_xattrs_initialized() condition syzbot is reporting NULL pointer dereference at reiserfs_security_init() [1], for commit ab17c4f02156c4f7 ("reiserfs: fixup xattr_root caching") is assuming that REISERFS_SB(s)->xattr_root != NULL in reiserfs_xattr_jcreate_nblocks() despite that commit made REISERFS_SB(sb)->priv_root != NULL && REISERFS_SB(s)->xattr_root == NULL case possible. I guess that commit 6cb4aff0a77cc0e6 ("reiserfs: fix oops while creating privroot with selinux enabled") wanted to check xattr_root != NULL before reiserfs_xattr_jcreate_nblocks(), for the changelog is talking about the xattr root. The issue is that while creating the privroot during mount reiserfs_security_init calls reiserfs_xattr_jcreate_nblocks which dereferences the xattr root. The xattr root doesn't exist, so we get an oops. Therefore, update reiserfs_xattrs_initialized() to check both the privroot and the xattr root. Link: https://syzkaller.appspot.com/bug?id=8abaedbdeb32c861dc5340544284167dd0e46cde # [1] Reported-and-tested-by: syzbot Signed-off-by: Tetsuo Handa Fixes: 6cb4aff0a77c ("reiserfs: fix oops while creating privroot with selinux enabled") Acked-by: Jeff Mahoney Acked-by: Jan Kara Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index 9b3b06da568c..e47fde1182de 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -44,7 +44,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec); static inline int reiserfs_xattrs_initialized(struct super_block *sb) { - return REISERFS_SB(sb)->priv_root != NULL; + return REISERFS_SB(sb)->priv_root && REISERFS_SB(sb)->xattr_root; } #define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header)) -- cgit v1.2.3 From 4b982bd0f383db9132e892c0c5144117359a6289 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 1 Apr 2021 08:38:34 -0600 Subject: io_uring: don't mark S_ISBLK async work as unbounded S_ISBLK is marked as unbounded work for async preparation, because it doesn't match S_ISREG. 
That is incorrect, as any read/write to a block device is also a bounded operation. Fix it up and ensure that S_ISBLK isn't marked unbounded. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6d7a1b69712b..a16b7df934d1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1213,7 +1213,7 @@ static void io_prep_async_work(struct io_kiocb *req) if (req->flags & REQ_F_ISREG) { if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); - } else { + } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } -- cgit v1.2.3 From 696ee88a7c50f96573f98aa76cc74286033140c1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 09:55:04 +0100 Subject: io_uring/io-wq: protect against sprintf overflow task_pid may be large enough to not fit into the space left in TASK_COMM_LEN-sized buffers and overflow in sprintf. We don't care that much about uniqueness, so replace it with the safer snprintf(). Reported-by: Alexey Dobriyan Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1702c6145d7e1c46fbc382f28334c02e1a3d3994.1617267273.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 4 ++-- fs/io_uring.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 7434eb40ca8c..433c4d3c3c1c 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -484,7 +484,7 @@ static int io_wqe_worker(void *data) worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); io_wqe_inc_running(worker); - sprintf(buf, "iou-wrk-%d", wq->task_pid); + snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task_pid); set_task_comm(current, buf); while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { @@ -711,7 +711,7 @@ static int io_wq_manager(void *data) char buf[TASK_COMM_LEN]; int node; - sprintf(buf, "iou-mgr-%d", wq->task_pid); + snprintf(buf, sizeof(buf), "iou-mgr-%d", wq->task_pid); set_task_comm(current, buf); do { diff --git a/fs/io_uring.c b/fs/io_uring.c index a16b7df934d1..4a6701b5065e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6718,7 +6718,7 @@ static int io_sq_thread(void *data) char buf[TASK_COMM_LEN]; DEFINE_WAIT(wait); - sprintf(buf, "iou-sqp-%d", sqd->task_pid); + snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); set_task_comm(current, buf); current->pf_io_worker = NULL; -- cgit v1.2.3 From 07204f21577a1d882f0259590c3553fe6a476381 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Apr 2021 12:18:48 +0100 Subject: io_uring: fix EIOCBQUEUED iter revert iov_iter_revert() is done in completion handlers that happen before read/write returns -EIOCBQUEUED, so there is no need to repeat the revert afterwards. Moreover, even though it may appear to be just a no-op, it actually races with 1) the user forging a new iovec of a different size, and 2) reissue, which is done via io-wq and continues completely asynchronously.
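The ownership rule the patch enforces, as a sketch (simplified from the diff below):

	ret = io_iter_do_read(req, iter);
	if (ret == -EIOCBQUEUED)
		goto out_free;	/* the async completion side owns the iter now
				 * and has already reverted it as needed; a
				 * second revert here would race */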
Fixes: 3e6a0d3c7571c ("io_uring: fix -EAGAIN retry with IOPOLL") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 4a6701b5065e..717942474fa9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3284,8 +3284,6 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(req, iter); if (ret == -EIOCBQUEUED) { - if (req->async_data) - iov_iter_revert(iter, io_size - iov_iter_count(iter)); goto out_free; } else if (ret == -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ @@ -3418,8 +3416,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) /* no retry on NONBLOCK nor RWF_NOWAIT */ if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) goto done; - if (ret2 == -EIOCBQUEUED && req->async_data) - iov_iter_revert(iter, io_size - iov_iter_count(iter)); if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) -- cgit v1.2.3 From 9b5b872215fe6d1ca6a1ef411f130bd58e269012 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 2 Apr 2021 10:29:36 +0200 Subject: file: fix close_range() for unshare+cloexec syzbot reported a bug when putting the last reference to a tasks file descriptor table. Debugging this showed we didn't recalculate the current maximum fd number for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC after we unshared the file descriptors table. So max_fd could exceed the current fdtable maximum causing us to set excessive bits. As a concrete example, let's say the user requested everything from fd 4 to ~0UL to be closed and their current fdtable size is 256 with their highest open fd being 4. With CLOSE_RANGE_UNSHARE the caller will end up with a new fdtable which has room for 64 file descriptors since that is the lowest fdtable size we accept. But now max_fd will still point to 255 and needs to be adjusted. Fix this by retrieving the correct maximum fd value in __range_cloexec(). Reported-by: syzbot+283ce5a46486d6acdbaf@syzkaller.appspotmail.com Fixes: 582f1fb6b721 ("fs, close_range: add flag CLOSE_RANGE_CLOEXEC") Fixes: fec8a6a69103 ("close_range: unshare all fds for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC") Cc: Christoph Hellwig Cc: Giuseppe Scrivano Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- fs/file.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index f3a4bac2cbe9..f633348029a5 100644 --- a/fs/file.c +++ b/fs/file.c @@ -629,17 +629,30 @@ int close_fd(unsigned fd) } EXPORT_SYMBOL(close_fd); /* for ksys_close() */ +/** + * last_fd - return last valid index into fd table + * @cur_fds: files struct + * + * Context: Either rcu read lock or files_lock must be held. + * + * Returns: Last valid index into fdtable. 
+ */ +static inline unsigned last_fd(struct fdtable *fdt) +{ + return fdt->max_fds - 1; +} + static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; - if (fd > max_fd) - return; - + /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); - bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); + max_fd = min(last_fd(fdt), max_fd); + if (fd <= max_fd) + bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } -- cgit v1.2.3 From f8b78caf21d5bc3fcfc40c18898f9d52ed1451a5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 20 Nov 2020 17:10:28 +0000 Subject: block: don't ignore REQ_NOWAIT for direct IO If IOCB_NOWAIT is set on submission, then that needs to get propagated to REQ_NOWAIT on the block side. Otherwise we completely lose this information, and any issuer of IOCB_NOWAIT IO will potentially end up blocking on eg request allocation on the storage side. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/block_dev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 28d583fcdc2c..09d6f7229db9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -275,6 +275,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio.bi_opf = dio_bio_write_op(iocb); task_io_account_write(ret); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio.bi_opf |= REQ_NOWAIT; if (iocb->ki_flags & IOCB_HIPRI) bio_set_polled(&bio, iocb); @@ -428,6 +430,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; -- cgit v1.2.3 From 230d50d448acb6639991440913299e50cacf1daf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 1 Apr 2021 20:41:15 -0600 Subject: io_uring: move reissue into regular IO path It's non-obvious how retry is done for block backed files, when it happens off the kiocb done path. It also makes it tricky to deal with the iov_iter handling. Just mark the req as needing a reissue, and handling it from the submission path instead. This makes it directly obvious that we're not re-importing the iovec from userspace past the submit point, and it means that we can just reuse our usual -EAGAIN retry path from the read/write handling. At some point in the future, we'll gain the ability to always reliably return -EAGAIN through the stack. A previous attempt on the block side didn't pan out and got reverted, hence the need to check for this information out-of-band right now. 
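The two halves of the new flow, in outline (an illustrative condensation, not the full diff):

	/* completion side: don't reissue inline, just mark the request */
	if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) {
		req->flags |= REQ_F_REISSUE;
		return;
	}

	/* submission side: treat the mark exactly like a fresh -EAGAIN */
	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE))
		/* fall into the usual async retry path */;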
Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 717942474fa9..8be542050648 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -697,6 +697,7 @@ enum { REQ_F_NO_FILE_TABLE_BIT, REQ_F_LTIMEOUT_ACTIVE_BIT, REQ_F_COMPLETE_INLINE_BIT, + REQ_F_REISSUE_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -740,6 +741,8 @@ enum { REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT), /* completion is deferred through io_comp_state */ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), + /* caller should reissue async */ + REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), }; struct async_poll { @@ -2503,8 +2506,10 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, if (req->rw.kiocb.ki_flags & IOCB_WRITE) kiocb_end_write(req); - if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req)) + if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE; return; + } if (res != req->result) req_set_fail_links(req); if (req->flags & REQ_F_BUFFER_SELECTED) @@ -3283,9 +3288,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(req, iter); - if (ret == -EIOCBQUEUED) { - goto out_free; - } else if (ret == -EAGAIN) { + if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; @@ -3295,6 +3298,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) /* some cases will consume bytes even on error returns */ iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = 0; + } else if (ret == -EIOCBQUEUED) { + goto out_free; } else if (ret <= 0 || ret == io_size || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) { /* read all, failed, already did sync or don't want to retry */ @@ -3407,6 +3412,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) else ret2 = -EINVAL; + if (req->flags & REQ_F_REISSUE) + ret2 = -EAGAIN; + /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT. @@ -6160,6 +6168,7 @@ static void io_wq_submit_work(struct io_wq_work *work) ret = -ECANCELED; if (!ret) { + req->flags &= ~REQ_F_REISSUE; do { ret = io_issue_sqe(req, 0); /* -- cgit v1.2.3 From e82ad4853948382d37ac512b27a3e70b6f01c103 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 2 Apr 2021 19:45:34 -0600 Subject: io_uring: fix !CONFIG_BLOCK compilation failure kernel test robot correctly pinpoints a compilation failure if CONFIG_BLOCK isn't set: fs/io_uring.c: In function '__io_complete_rw': >> fs/io_uring.c:2509:48: error: implicit declaration of function 'io_rw_should_reissue'; did you mean 'io_rw_reissue'? [-Werror=implicit-function-declaration] 2509 | if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { | ^~~~~~~~~~~~~~~~~~~~ | io_rw_reissue cc1: some warnings being treated as errors Ensure that we have a stub declaration of io_rw_should_reissue() for !CONFIG_BLOCK. 
Fixes: 230d50d448ac ("io_uring: move reissue into regular IO path") Reported-by: kernel test robot Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8be542050648..65a17d560a73 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2479,6 +2479,11 @@ static bool io_rw_should_reissue(struct io_kiocb *req) return false; return true; } +#else +static bool io_rw_should_reissue(struct io_kiocb *req) +{ + return false; +} #endif static bool io_rw_reissue(struct io_kiocb *req) -- cgit v1.2.3 From 7d01ef7585c07afaf487759a48486228cd065726 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 6 Apr 2021 12:33:07 -0400 Subject: Make sure nd->path.mnt and nd->path.dentry are always valid pointers Initialize them in set_nameidata() and make sure that terminate_walk() clears them once the pointers become potentially invalid (i.e. we leave RCU mode or drop them in non-RCU one). Currently we have "path_init() always initializes them and nobody accesses them outside of path_init()/terminate_walk() segments", which is asking for trouble. With that change we would have nd->path.{mnt,dentry} 1) always valid - NULL or pointing to currently allocated objects. 2) non-NULL while we are successfully walking 3) NULL when we are not walking at all 4) contributing to refcounts whenever non-NULL outside of RCU mode. Fixes: 6c6ec2b0a3e0 ("fs: add support for LOOKUP_CACHED") Reported-by: syzbot+c88a7030da47945a3cc3@syzkaller.appspotmail.com Tested-by: Christian Brauner Signed-off-by: Al Viro --- fs/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 216f16e74351..fc8760d4314e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -579,6 +579,8 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) p->stack = p->internal; p->dfd = dfd; p->name = name; + p->path.mnt = NULL; + p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; current->nameidata = p; @@ -652,6 +654,8 @@ static void terminate_walk(struct nameidata *nd) rcu_read_unlock(); } nd->depth = 0; + nd->path.mnt = NULL; + nd->path.dentry = NULL; } /* path_put is needed afterwards regardless of success or failure */ @@ -2322,8 +2326,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags) } nd->root.mnt = NULL; - nd->path.mnt = NULL; - nd->path.dentry = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { -- cgit v1.2.3 From 4f0ed93fb92d3528c73c80317509df3f800a222b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 6 Apr 2021 19:46:51 -0400 Subject: LOOKUP_MOUNTPOINT: we are cleaning "jumped" flag too late That (and traversals in case of umount .) should be done before complete_walk(). Either a braino or mismerge damage on queue reorders - either way, I should've spotted that much earlier. 
Fucked-up-by: Al Viro X-Paperbag: Brown Fixes: 161aff1d93ab "LOOKUP_MOUNTPOINT: fold path_mountpointat() into path_lookupat()" Cc: stable@vger.kernel.org # v5.7+ Signed-off-by: Al Viro --- fs/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index fc8760d4314e..48a2f288e802 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2421,16 +2421,16 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path while (!(err = link_path_walk(s, nd)) && (s = lookup_last(nd)) != NULL) ; + if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { + err = handle_lookup_down(nd); + nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... + } if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; - if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { - err = handle_lookup_down(nd); - nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... - } if (!err) { *path = nd->path; nd->path.mnt = NULL; -- cgit v1.2.3 From 4e456b30f78c429b183db420e23b26cde7e03a78 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 31 Mar 2021 14:35:24 +0000 Subject: cifs: On cifs_reconnect, resolve the hostname again. On cifs_reconnect, make sure that DNS resolution happens again. It could be the cause of connection to go dead in the first place. This also contains the fix for a build issue identified by Intel bot. Reported-by: kernel test robot Signed-off-by: Shyam Prasad N Reviewed-by: Paulo Alcantara (SUSE) Reviewed-by: Pavel Shilovsky CC: # 5.11+ Signed-off-by: Steve French --- fs/cifs/Kconfig | 3 +-- fs/cifs/Makefile | 5 +++-- fs/cifs/connect.c | 17 ++++++++++++++++- 3 files changed, 20 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index fe03cbdae959..bf52e9326ebe 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -18,6 +18,7 @@ config CIFS select CRYPTO_AES select CRYPTO_LIB_DES select KEYS + select DNS_RESOLVER help This is the client VFS module for the SMB3 family of NAS protocols, (including support for the most recent, most secure dialect SMB3.1.1) @@ -112,7 +113,6 @@ config CIFS_WEAK_PW_HASH config CIFS_UPCALL bool "Kerberos/SPNEGO advanced session setup" depends on CIFS - select DNS_RESOLVER help Enables an upcall mechanism for CIFS which accesses userspace helper utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets @@ -179,7 +179,6 @@ config CIFS_DEBUG_DUMP_KEYS config CIFS_DFS_UPCALL bool "DFS feature support" depends on CIFS - select DNS_RESOLVER help Distributed File System (DFS) support is used to access shares transparently in an enterprise name space, even if the share diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 5213b20843b5..3ee3b7de4ded 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -10,13 +10,14 @@ cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ - smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o + smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \ + dns_resolve.o cifs-$(CONFIG_CIFS_XATTR) += xattr.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o -cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o dfs_cache.o +cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o diff --git 
a/fs/cifs/connect.c b/fs/cifs/connect.c index eec8a2052da2..24668eb006c6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -87,7 +87,6 @@ static void cifs_prune_tlinks(struct work_struct *work); * * This should be called with server->srv_mutex held. */ -#ifdef CONFIG_CIFS_DFS_UPCALL static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) { int rc; @@ -124,6 +123,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) return !rc ? -1 : 0; } +#ifdef CONFIG_CIFS_DFS_UPCALL /* These functions must be called with server->srv_mutex held */ static void reconn_set_next_dfs_target(struct TCP_Server_Info *server, struct cifs_sb_info *cifs_sb, @@ -321,14 +321,29 @@ cifs_reconnect(struct TCP_Server_Info *server) #endif #ifdef CONFIG_CIFS_DFS_UPCALL + if (cifs_sb && cifs_sb->origin_fullpath) /* * Set up next DFS target server (if any) for reconnect. If DFS * feature is disabled, then we will retry last server we * connected to before. */ reconn_set_next_dfs_target(server, cifs_sb, &tgt_list, &tgt_it); + else { +#endif + /* + * Resolve the hostname again to make sure that IP address is up-to-date. + */ + rc = reconn_set_ipaddr_from_hostname(server); + if (rc) { + cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n", + __func__, rc); + } + +#ifdef CONFIG_CIFS_DFS_UPCALL + } #endif + #ifdef CONFIG_CIFS_SWN_UPCALL } #endif -- cgit v1.2.3 From d135be0a7fb83f4dd68721b3355fec6de686834c Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 1 Apr 2021 15:51:17 +0800 Subject: fs: cifs: Remove unnecessary struct declaration struct cifs_readdata is declared twice. One is declared at line 208, and struct cifs_readdata is defined below it, so the forward declaration is not needed. Remove the duplicate. Signed-off-by: Wan Jiabing Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 67c056a9a519..ec824ab8c5ca 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1283,8 +1283,6 @@ struct cifs_aio_ctx { bool direct_io; }; -struct cifs_readdata; - /* asynchronous read support */ struct cifs_readdata { struct kref refcount; -- cgit v1.2.3 From 0fc9322ab5e1fe6910c9673e1a7ff29f7dd72611 Mon Sep 17 00:00:00 2001 From: Maciek Borzecki Date: Tue, 6 Apr 2021 17:02:29 +0200 Subject: cifs: escape spaces in share names Commit 653a5efb849a ("cifs: update super_operations to show_devname") introduced the display of devname for cifs mounts. However, when mounting a share which has a whitespace in the name, that exact share name is also displayed in mountinfo. Make sure that all whitespace is escaped.
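A hedged example of the visible effect (the share name is invented; seq_escape() renders the listed characters as octal escapes, keeping /proc/self/mountinfo parseable):

	before: //server/my share
	after:  //server/my\040share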
Signed-off-by: Maciek Borzecki CC: # 5.11+ Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 099ad9f3660b..5ddd20b62484 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -476,7 +476,8 @@ static int cifs_show_devname(struct seq_file *m, struct dentry *root) seq_puts(m, "none"); else { convert_delimiter(devname, '/'); - seq_puts(m, devname); + /* escape all spaces in share names */ + seq_escape(m, devname, " \t"); kfree(devname); } return 0; -- cgit v1.2.3 From 6ad7f2332e84c46f0c94e73e05b5b7c2bc1a6b74 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 8 Apr 2021 01:54:39 +0100 Subject: io_uring: clear F_REISSUE right after getting it There are lots of ways an r/w request may continue its path after getting REQ_F_REISSUE; it's not necessarily io-wq and can be, e.g., apoll, submitted via io_async_task_func() -> __io_req_task_submit(). Clear the flag right after getting it, so the next attempt is well prepared regardless of how the request will be executed. Fixes: 230d50d448ac ("io_uring: move reissue into regular IO path") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/11dcead939343f4e27cab0074d34afcab771bfa4.1617842918.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 65a17d560a73..f1881ac0744b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3294,6 +3294,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(req, iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { + req->flags &= ~REQ_F_REISSUE; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; @@ -3417,8 +3418,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) else ret2 = -EINVAL; - if (req->flags & REQ_F_REISSUE) + if (req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; ret2 = -EAGAIN; + } /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just @@ -6173,7 +6176,6 @@ static void io_wq_submit_work(struct io_wq_work *work) ret = -ECANCELED; if (!ret) { - req->flags &= ~REQ_F_REISSUE; do { ret = io_issue_sqe(req, 0); /* -- cgit v1.2.3 From 9728463737db027557e8ba315cbbca6b81122c04 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 8 Apr 2021 19:28:03 +0100 Subject: io_uring: fix rw req completion WARNING: at fs/io_uring.c:8578 io_ring_exit_work.cold+0x0/0x18 As reissuing is now passed back by REQ_F_REISSUE and kiocb_done() internally uses __io_complete_rw(), it may stop after setting the flag, leaving a dangling request. There are tricky edge cases, e.g. reading beyond the file boundary, so the easiest way is to hand-code reissue in kiocb_done() as __io_complete_rw() was doing for us before.
Fixes: 230d50d448ac ("io_uring: move reissue into regular IO path") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f602250d292f8a84cca9a01d747744d1e797be26.1617842918.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index f1881ac0744b..bd14327c8e7e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2762,6 +2762,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); struct io_async_rw *io = req->async_data; + bool check_reissue = kiocb->ki_complete == io_complete_rw; /* add previously done IO, if any */ if (io && io->bytes_done > 0) { @@ -2777,6 +2778,18 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, __io_complete_rw(req, ret, 0, issue_flags); else io_rw_done(kiocb, ret); + + if (check_reissue && req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; + if (!io_rw_reissue(req)) { + int cflags = 0; + + req_set_fail_links(req); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_rw_kbuf(req); + __io_req_complete(req, issue_flags, ret, cflags); + } + } } static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) -- cgit v1.2.3 From c60eb049f4a19ddddcd3ee97a9c79ab8066a6a03 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 8 Apr 2021 01:54:42 +0100 Subject: io-wq: cancel unbounded works on io-wq destroy WARNING: CPU: 5 PID: 227 at fs/io_uring.c:8578 io_ring_exit_work+0xe6/0x470 RIP: 0010:io_ring_exit_work+0xe6/0x470 Call Trace: process_one_work+0x206/0x400 worker_thread+0x4a/0x3d0 kthread+0x129/0x170 ret_from_fork+0x22/0x30 INFO: task lfs-openat:2359 blocked for more than 245 seconds. task:lfs-openat state:D stack: 0 pid: 2359 ppid: 1 flags:0x00000004 Call Trace: ... wait_for_completion+0x8b/0xf0 io_wq_destroy_manager+0x24/0x60 io_wq_put_and_exit+0x18/0x30 io_uring_clean_tctx+0x76/0xa0 __io_uring_files_cancel+0x1b9/0x2e0 do_exit+0xc0/0xb40 ... Even after io-wq destroy has been issued io-wq worker threads will continue executing all left work items as usual, and may hang waiting for I/O that won't ever complete (aka unbounded). [<0>] pipe_read+0x306/0x450 [<0>] io_iter_do_read+0x1e/0x40 [<0>] io_read+0xd5/0x330 [<0>] io_issue_sqe+0xd21/0x18a0 [<0>] io_wq_submit_work+0x6c/0x140 [<0>] io_worker_handle_work+0x17d/0x400 [<0>] io_wqe_worker+0x2c0/0x330 [<0>] ret_from_fork+0x22/0x30 Cancel all unbounded I/O instead of executing them. This changes the user visible behaviour, but that's inevitable as io-wq is not per task. 
Suggested-by: Jens Axboe Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cd4b543154154cba055cf86f351441c2174d7f71.1617842918.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/io-wq.c b/fs/io-wq.c index 433c4d3c3c1c..4eba531bea5a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -415,6 +415,7 @@ static void io_worker_handle_work(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; + bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); do { struct io_wq_work *work; @@ -444,6 +445,9 @@ get_next: unsigned int hash = io_get_work_hash(work); next_hashed = wq_next_work(work); + + if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) + work->flags |= IO_WQ_WORK_CANCEL; wq->do_work(work); io_assign_current_work(worker, NULL); -- cgit v1.2.3 From 90bd070aae6c4fb5d302f9c4b9c88be60c8197ec Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 9 Apr 2021 13:27:29 -0700 Subject: ocfs2: fix deadlock between setattr and dio_end_io_write The following deadlock is detected: truncate -> setattr path is waiting for pending direct IO to be done (inode->i_dio_count become zero) with inode->i_rwsem held (down_write). PID: 14827 TASK: ffff881686a9af80 CPU: 20 COMMAND: "ora_p005_hrltd9" #0 __schedule at ffffffff818667cc #1 schedule at ffffffff81866de6 #2 inode_dio_wait at ffffffff812a2d04 #3 ocfs2_setattr at ffffffffc05f322e [ocfs2] #4 notify_change at ffffffff812a5a09 #5 do_truncate at ffffffff812808f5 #6 do_sys_ftruncate.constprop.18 at ffffffff81280cf2 #7 sys_ftruncate at ffffffff81280d8e #8 do_syscall_64 at ffffffff81003949 #9 entry_SYSCALL_64_after_hwframe at ffffffff81a001ad dio completion path is going to complete one direct IO (decrement inode->i_dio_count), but before that it hung at locking inode->i_rwsem: #0 __schedule+700 at ffffffff818667cc #1 schedule+54 at ffffffff81866de6 #2 rwsem_down_write_failed+536 at ffffffff8186aa28 #3 call_rwsem_down_write_failed+23 at ffffffff8185a1b7 #4 down_write+45 at ffffffff81869c9d #5 ocfs2_dio_end_io_write+180 at ffffffffc05d5444 [ocfs2] #6 ocfs2_dio_end_io+85 at ffffffffc05d5a85 [ocfs2] #7 dio_complete+140 at ffffffff812c873c #8 dio_aio_complete_work+25 at ffffffff812c89f9 #9 process_one_work+361 at ffffffff810b1889 #10 worker_thread+77 at ffffffff810b233d #11 kthread+261 at ffffffff810b7fd5 #12 ret_from_fork+62 at ffffffff81a0035e Thus above forms ABBA deadlock. The same deadlock was mentioned in upstream commit 28f5a8a7c033 ("ocfs2: should wait dio before inode lock in ocfs2_setattr()"). It seems that that commit only removed the cluster lock (the victim of above dead lock) from the ABBA deadlock party. End-user visible effects: Process hang in truncate -> ocfs2_setattr path and other processes hang at ocfs2_dio_end_io_write path. This is to fix the deadlock itself. It removes inode_lock() call from dio completion path to remove the deadlock and add ip_alloc_sem lock in setattr path to synchronize the inode modifications. 
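Condensing the two traces, the ABBA shape is (A = inode->i_rwsem, B = the pending-DIO count):

	setattr path                       dio completion path
	------------                       -------------------
	down_write(&inode->i_rwsem)   (A)
	inode_dio_wait(inode)         (B)  inode_lock(inode)    (A) <- blocks
	  waits for i_dio_count == 0       inode_dio_end(inode) (B) never runs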
[wen.gang.wang@oracle.com: remove the "had_alloc_lock" as suggested] Link: https://lkml.kernel.org/r/20210402171344.1605-1-wen.gang.wang@oracle.com Link: https://lkml.kernel.org/r/20210331203654.3911-1-wen.gang.wang@oracle.com Signed-off-by: Wengang Wang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 11 +---------- fs/ocfs2/file.c | 8 ++++++-- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3bfb4147895a..ad20403b383f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2295,7 +2295,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle = NULL; loff_t end = offset + bytes; - int ret = 0, credits = 0, locked = 0; + int ret = 0, credits = 0; ocfs2_init_dealloc_ctxt(&dealloc); @@ -2306,13 +2306,6 @@ static int ocfs2_dio_end_io_write(struct inode *inode, !dwc->dw_orphaned) goto out; - /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we - * are in that context. */ - if (dwc->dw_writer_pid != task_pid_nr(current)) { - inode_lock(inode); - locked = 1; - } - ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret < 0) { mlog_errno(ret); @@ -2393,8 +2386,6 @@ out: if (meta_ac) ocfs2_free_alloc_context(meta_ac); ocfs2_run_deallocs(osb, &dealloc); - if (locked) - inode_unlock(inode); ocfs2_dio_free_write_ctx(inode, dwc); return ret; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6611c64ca0be..5edc1d0cf115 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1245,22 +1245,24 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, goto bail_unlock; } } + down_write(&OCFS2_I(inode)->ip_alloc_sem); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + 2 * ocfs2_quota_trans_credits(sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); - goto bail_unlock; + goto bail_unlock_alloc; } status = __dquot_transfer(inode, transfer_to); if (status < 0) goto bail_commit; } else { + down_write(&OCFS2_I(inode)->ip_alloc_sem); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); - goto bail_unlock; + goto bail_unlock_alloc; } } @@ -1273,6 +1275,8 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, bail_commit: ocfs2_commit_trans(osb, handle); +bail_unlock_alloc: + up_write(&OCFS2_I(inode)->ip_alloc_sem); bail_unlock: if (status && inode_locked) { ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); -- cgit v1.2.3 From df41872b68601059dd4a84858952dcae58acd331 Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Fri, 9 Apr 2021 13:27:35 -0700 Subject: fs: direct-io: fix missing sdio->boundary I encountered a hung task issue (not a performance one) when running DIO on a device that needs contiguous LBAs (for example, an open-channel SSD). A hang can occur in the following interleaving:

	DIO:        get addr A (at a boundary), merge it into the BIO,
	            but don't submit because the boundary flag was lost
	Checkpoint: flush dirty data (get addr A+1), wait for IO(A+1);
	            writeback times out, because DIO(A) was never submitted
	DIO:        get addr A+2 fails, because checkpoint is in progress

dio_send_cur_page() may clear sdio->boundary, so prevent it from missing a boundary.
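The fix pattern in isolation (sketch): snapshot the flag before calling into code that may clear it, and act on the snapshot:

	int boundary = sdio->boundary;	/* dio_send_cur_page() may clear it */
	...
	if (boundary)			/* use the snapshot, not sdio->boundary */
		ret = dio_send_cur_page(dio, sdio, map_bh);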
Link: https://lkml.kernel.org/r/20210322042253.38312-1-jack.qiu@huawei.com Fixes: b1058b981272 ("direct-io: submit bio after boundary buffer is added to it") Signed-off-by: Jack Qiu Reviewed-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index b61491bf3166..b2e86e739d7a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -812,6 +812,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, struct buffer_head *map_bh) { int ret = 0; + int boundary = sdio->boundary; /* dio_send_cur_page may clear it */ if (dio->op == REQ_OP_WRITE) { /* @@ -850,10 +851,10 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; out: /* - * If sdio->boundary then we want to schedule the IO now to + * If boundary then we want to schedule the IO now to * avoid metadata seeks. */ - if (sdio->boundary) { + if (boundary) { ret = dio_send_cur_page(dio, sdio, map_bh); if (sdio->bio) dio_bio_submit(dio, sdio); -- cgit v1.2.3 From 53b74fa990bf76f290aa5930abfcf37424a1a865 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 8 Apr 2021 17:25:28 +0900 Subject: btrfs: zoned: move superblock logging zone location Moves the location of the superblock logging zones. The new locations of the logging zones are now determined based on fixed block addresses instead of on fixed zone numbers. The old placement method based on fixed zone numbers causes problems when one needs to inspect a file system image without access to the drive zone information. In such case, the super block locations cannot be reliably determined as the zone size is unknown. By locating the superblock logging zones using fixed addresses, we can scan a dumped file system image without the zone information since a super block copy will always be present at or after the fixed known locations. Introduce the following three pairs of zones containing fixed offset locations, regardless of the device zone size. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If a logging zone is outside of the disk capacity, we do not record the superblock copy. The first copy position is much larger than for a non-zoned filesystem, which is at 64M. This is to avoid overlapping with the log zones for the primary superblock. This higher location is arbitrary but allows supporting devices with very large zone sizes, plus some space around in between. Such large zone size is unrealistic and very unlikely to ever be seen in real devices. Currently, SMR disks have a zone size of 256MB, and we are expecting ZNS drives to be in the 1-4GB range, so this limit gives us room to breathe. For now, we only allow zone sizes up to 8GB. The maximum zone size that would still fit in the space is 256G. The fixed location addresses are somewhat arbitrary, with the intent of maintaining superblock reliability for smaller and larger devices, with the preference for the latter. For this reason, there are two superblocks under the first 1T. This should cover use cases for physical devices and for emulated/device-mapper devices. The superblock logging zones are reserved for superblock logging and never used for data or metadata blocks. 
Note that we only reserve the two zones per primary/copy actually used for superblock logging. We do not reserve the ranges of zones possibly containing superblocks with the largest supported zone size (0-16GB, 512G-528GB, 4096G-4112G). The zones containing the fixed location offsets used to store superblocks on a non-zoned volume are also reserved to avoid confusion. Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 53 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 43948bd40e02..ba7a303300a3 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -21,9 +21,30 @@ /* Pseudo write pointer value for conventional zone */ #define WP_CONVENTIONAL ((u64)-2) +/* + * Location of the first zone of superblock logging zone pairs. + * + * - primary superblock: 0B (zone 0) + * - first copy: 512G (zone starting at that offset) + * - second copy: 4T (zone starting at that offset) + */ +#define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL) +#define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G) +#define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G) + +#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET) +#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET) + /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 +/* + * Maximum supported zone size. Currently, SMR disks have a zone size of + * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not + * expect the zone size to become larger than 8GiB in the near future. + */ +#define BTRFS_MAX_ZONE_SIZE SZ_8G + static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct blk_zone *zones = data; @@ -111,23 +132,22 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, } /* - * The following zones are reserved as the circular buffer on ZONED btrfs. 
- The primary superblock: zones 0 and 1 - The first copy: zones 16 and 17 - The second copy: zones 1024 or zone at 256GB which is minimum, and - the following one + * Get the first zone number of the superblock mirror */ static inline u32 sb_zone_number(int shift, int mirror) { - ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); + u64 zone; + ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { - case 0: return 0; - case 1: return 16; - case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024); + case 0: zone = 0; break; + case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break; + case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break; } - return 0; + ASSERT(zone <= U32_MAX); + + return (u32)zone; } /* @@ -300,10 +320,21 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) zone_sectors = bdev_zone_sectors(bdev); } - nr_sectors = bdev_nr_sectors(bdev); /* Check if it's power of 2 (see is_power_of_2) */ ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; + + /* We reject devices with a zone size larger than 8GB */ + if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) { + btrfs_err_in_rcu(fs_info, + "zoned: %s: zone size %llu larger than supported maximum %llu", + rcu_str_deref(device->name), + zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); + ret = -EINVAL; + goto out; + } + + nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->max_zone_append_size = (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT; -- cgit v1.2.3 From c7d95613c7d6e003969722a290397b8271bdad17 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 13 Apr 2021 11:43:00 +0100 Subject: io_uring: fix early sqd_list removal sqpoll hangs [ 245.463317] INFO: task iou-sqp-1374:1377 blocked for more than 122 seconds. [ 245.463334] task:iou-sqp-1374 state:D flags:0x00004000 [ 245.463345] Call Trace: [ 245.463352] __schedule+0x36b/0x950 [ 245.463376] schedule+0x68/0xe0 [ 245.463385] __io_uring_cancel+0xfb/0x1a0 [ 245.463407] do_exit+0xc0/0xb40 [ 245.463423] io_sq_thread+0x49b/0x710 [ 245.463445] ret_from_fork+0x22/0x30 It happens when sqpoll forgets to run park_task_work and goes to exit; the exiting user may then remove the ctx from sqd_list, so the corresponding io_sq_thread() -> io_uring_cancel_sqpoll() won't be executed. Hopefully it just gets stuck in do_exit() in this case.
From c7d95613c7d6e003969722a290397b8271bdad17 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 13 Apr 2021 11:43:00 +0100
Subject: io_uring: fix early sqd_list removal sqpoll hangs

[ 245.463317] INFO: task iou-sqp-1374:1377 blocked for more than 122 seconds.
[ 245.463334] task:iou-sqp-1374 state:D flags:0x00004000
[ 245.463345] Call Trace:
[ 245.463352] __schedule+0x36b/0x950
[ 245.463376] schedule+0x68/0xe0
[ 245.463385] __io_uring_cancel+0xfb/0x1a0
[ 245.463407] do_exit+0xc0/0xb40
[ 245.463423] io_sq_thread+0x49b/0x710
[ 245.463445] ret_from_fork+0x22/0x30

It happens when sqpoll forgets to run park_task_work and goes to exit;
the exiting user may then remove ctx from sqd_list, and so the
corresponding io_sq_thread() -> io_uring_cancel_sqpoll() won't be
executed. Hopefully it just gets stuck in do_exit() in this case.

Fixes: dbe1bdbb39db ("io_uring: handle signals for IO threads like a normal thread")
Reported-by: Joakim Hassila
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bd14327c8e7e..dff34975d86b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6754,6 +6754,9 @@ static int io_sq_thread(void *data)
 	current->flags |= PF_NO_SETAFFINITY;
 
 	mutex_lock(&sqd->lock);
+	/* a user may have exited before the thread started */
+	io_run_task_work_head(&sqd->park_task_work);
+
 	while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) {
 		int ret;
 		bool cap_entries, sqt_spin, needs_sched;
@@ -6770,10 +6773,10 @@ static int io_sq_thread(void *data)
 			}
 			cond_resched();
 			mutex_lock(&sqd->lock);
-			if (did_sig)
-				break;
 			io_run_task_work();
 			io_run_task_work_head(&sqd->park_task_work);
+			if (did_sig)
+				break;
 			timeout = jiffies + sqd->sq_thread_idle;
 			continue;
 		}
-- cgit v1.2.3
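The fix above is an ordering rule: drain pending park work once at thread
start, because a user may have exited before the thread ever ran, and again
before acting on a stop or signal condition, so that a final park request
queued during the last loop round cannot be lost. The userspace C model below
illustrates just that rule; park_work, run_park_work() and should_stop are
hypothetical stand-ins, not io_uring API.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for sqd->park_task_work and the stop condition. */
struct work {
	void (*fn)(void);
	struct work *next;
};

static struct work *park_work;	/* queued by a parking or exiting user */
static bool should_stop;

/* Models io_uring_cancel_sqpoll(): the cleanup that must not be skipped. */
static void cancel_ctx(void)
{
	printf("park work ran, ctx cancelled\n");
	should_stop = true;
}

/* Drain everything queued so far, like io_run_task_work_head(). */
static void run_park_work(void)
{
	while (park_work) {
		struct work *w = park_work;

		park_work = w->next;
		w->fn();
	}
}

static void worker_loop(void)
{
	/* A user may have exited before this thread started: drain first. */
	run_park_work();

	while (!should_stop) {
		/* ... one round of submission queue polling ... */

		/*
		 * Drain pending work before acting on a stop or signal
		 * condition; breaking out first is exactly how a final
		 * park request gets lost on exit.
		 */
		run_park_work();
	}
}

int main(void)
{
	struct work w = { .fn = cancel_ctx, .next = NULL };

	park_work = &w;		/* the user queued park work, then exited */
	worker_loop();
	return 0;
}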
From 0c93ac69407d63a85be0129aa55ffaec27ffebd3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sat, 17 Apr 2021 09:27:04 -0700
Subject: readdir: make sure to verify directory entry for legacy interfaces too

This does the directory entry name verification for the legacy
"fillonedir" (and compat) interface that goes all the way back to the
dark ages before we had a proper dirent, and the readdir() system call
returned just a single entry at a time.

Nobody should use this interface unless you still have binaries from
1991, but let's do it right.

This came up during discussions about unsafe_copy_to_user() and proper
checking of all the inputs to it, as the networking layer is looking to
use it in a few new places. So let's make sure the _old_ users do it
all right and proper, before we add new ones.

See also commit 8a23eb804ca4 ("Make filldir[64]() verify the directory
entry filename is valid") which did the proper modern interfaces that
people actually use. It had a note:

    Note that I didn't bother adding the checks to any legacy
    interfaces that nobody uses.

which this now corrects.

Note that we really don't care about POSIX and the presence of '/' in a
directory entry, but verify_dirent_name() also ends up doing the proper
name length verification which is what the input checking discussion
was about.

[ Another option would be to remove the support for this particular
  very old interface: any binaries that use it are likely a.out
  binaries, and they will no longer run anyway since we removed a.out
  binfmt support in commit eac616557050 ("x86: Deprecate a.out
  support").

  But I'm not sure which came first: getdents() or ELF support, so
  let's pretend somebody might still have a working binary that uses
  the legacy readdir() case.. ]

Link: https://lore.kernel.org/lkml/CAHk-=wjbvzCAhAtvG0d81W5o0-KT5PPTHhfJ5ieDFq+bGtgOYg@mail.gmail.com/
Acked-by: Al Viro
Signed-off-by: Linus Torvalds
---
 fs/readdir.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/readdir.c b/fs/readdir.c
index 19434b3c982c..09e8ed7d4161 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -150,6 +150,9 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
 	if (buf->result)
 		return -EINVAL;
 
+	buf->result = verify_dirent_name(name, namlen);
+	if (buf->result < 0)
+		return buf->result;
 	d_ino = ino;
 	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
 		buf->result = -EOVERFLOW;
@@ -405,6 +408,9 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name,
 	if (buf->result)
 		return -EINVAL;
 
+	buf->result = verify_dirent_name(name, namlen);
+	if (buf->result < 0)
+		return buf->result;
 	d_ino = ino;
 	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
 		buf->result = -EOVERFLOW;
-- cgit v1.2.3
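For reference, the verification wired into fillonedir() and
compat_fillonedir() above amounts to rejecting empty names, names of PATH_MAX
or longer, and names containing a '/' before anything is copied to userspace.
The userspace approximation below assumes those semantics and the -EIO error
code; the kernel's verify_dirent_name() remains the authoritative version.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>

/*
 * Userspace approximation of the name check: a directory entry name
 * must be non-empty, shorter than PATH_MAX, and must not contain a
 * path separator. Returns 0 on success, a negative errno on failure.
 */
static int check_dirent_name(const char *name, int namlen)
{
	if (namlen <= 0 || namlen >= PATH_MAX)
		return -EIO;
	if (memchr(name, '/', namlen))
		return -EIO;
	return 0;
}

int main(void)
{
	printf("%d\n", check_dirent_name("hello", 5));	/* 0: valid name */
	printf("%d\n", check_dirent_name("a/b", 3));	/* -EIO: has '/' */
	printf("%d\n", check_dirent_name("", 0));	/* -EIO: empty */
	return 0;
}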