diff options
Diffstat (limited to 'fs')
71 files changed, 778 insertions, 411 deletions
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index a4e9e6e07e93..d3c6bb22c5f4 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -322,6 +322,8 @@ static int afs_deliver_cb_callback(struct afs_call *call) return ret; call->unmarshall++; + fallthrough; + case 5: break; } @@ -418,6 +420,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) r->node[loop] = ntohl(b[loop + 5]); call->unmarshall++; + fallthrough; case 2: break; @@ -530,6 +533,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) r->node[loop] = ntohl(b[loop + 5]); call->unmarshall++; + fallthrough; case 2: break; @@ -663,6 +667,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; + fallthrough; case 3: break; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 9fbe5a5ec9bd..78719f2f567e 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1919,7 +1919,9 @@ static void afs_rename_edit_dir(struct afs_operation *op) new_inode = d_inode(new_dentry); if (new_inode) { spin_lock(&new_inode->i_lock); - if (new_inode->i_nlink > 0) + if (S_ISDIR(new_inode->i_mode)) + clear_nlink(new_inode); + else if (new_inode->i_nlink > 0) drop_nlink(new_inode); spin_unlock(&new_inode->i_lock); } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 2f695a260442..dd3f45d906d2 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -388,6 +388,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) req->file_size = vp->scb.status.size; call->unmarshall++; + fallthrough; case 5: break; @@ -1408,6 +1409,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) _debug("motd '%s'", p); call->unmarshall++; + fallthrough; case 8: break; @@ -1845,6 +1847,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) xdr_decode_AFSVolSync(&bp, &op->volsync); call->unmarshall++; + fallthrough; case 6: break; @@ -1979,6 +1982,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) xdr_decode_AFSVolSync(&bp, &op->volsync); call->unmarshall++; + fallthrough; case 4: break; diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index dc9327332f06..00fca3c66ba6 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -593,6 +593,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) if (ret < 0) return ret; call->unmarshall = 6; + fallthrough; case 6: break; diff --git a/fs/block_dev.c b/fs/block_dev.c index b8abccd03e5d..6cc4d4cfe0c2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1244,6 +1244,9 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) lockdep_assert_held(&bdev->bd_mutex); + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; + rescan: if (bdev->bd_part_count) return -EBUSY; @@ -1298,6 +1301,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) struct gendisk *disk = bdev->bd_disk; int ret = 0; + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; + if (!bdev->bd_openers) { if (!bdev_is_partition(bdev)) { ret = 0; @@ -1332,8 +1338,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) whole->bd_part_count++; mutex_unlock(&whole->bd_mutex); - if (!(disk->flags & GENHD_FL_UP) || - !bdev_nr_sectors(bdev)) { + if (!bdev_nr_sectors(bdev)) { __blkdev_put(whole, mode, 1); bdput(whole); return -ENXIO; @@ -1364,16 +1369,12 @@ struct block_device *blkdev_get_no_open(dev_t dev) struct block_device *bdev; struct gendisk *disk; - down_read(&bdev_lookup_sem); bdev = bdget(dev); if (!bdev) { - up_read(&bdev_lookup_sem); blk_request_module(dev); - down_read(&bdev_lookup_sem); - bdev = bdget(dev); if (!bdev) - goto unlock; + return NULL; } disk = bdev->bd_disk; @@ -1383,14 +1384,11 @@ struct block_device *blkdev_get_no_open(dev_t dev) goto put_disk; if (!try_module_get(bdev->bd_disk->fops->owner)) goto put_disk; - up_read(&bdev_lookup_sem); return bdev; put_disk: put_disk(disk); bdput: bdput(bdev); -unlock: - up_read(&bdev_lookup_sem); return NULL; } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2bea01d23a5b..d17ac301032e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -28,6 +28,7 @@ #include "compression.h" #include "extent_io.h" #include "extent_map.h" +#include "zoned.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; @@ -349,6 +350,7 @@ static void end_compressed_bio_write(struct bio *bio) */ inode = cb->inode; cb->compressed_pages[0]->mapping = cb->inode->i_mapping; + btrfs_record_physical_zoned(inode, cb->start, bio); btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], cb->start, cb->start + cb->len - 1, bio->bi_status == BLK_STS_OK); @@ -401,6 +403,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, u64 first_byte = disk_start; blk_status_t ret; int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; + const bool use_append = btrfs_use_zone_append(inode, disk_start); + const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; WARN_ON(!PAGE_ALIGNED(start)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); @@ -418,10 +422,31 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb->nr_pages = nr_pages; bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = REQ_OP_WRITE | write_flags; + bio->bi_opf = bio_op | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; + if (use_append) { + struct extent_map *em; + struct map_lookup *map; + struct block_device *bdev; + + em = btrfs_get_chunk_map(fs_info, disk_start, PAGE_SIZE); + if (IS_ERR(em)) { + kfree(cb); + bio_put(bio); + return BLK_STS_NOTSUPP; + } + + map = em->map_lookup; + /* We only support single profile for now */ + ASSERT(map->num_stripes == 1); + bdev = map->stripes[0].dev->bdev; + + bio_set_dev(bio, bdev); + free_extent_map(em); + } + if (blkcg_css) { bio->bi_opf |= REQ_CGROUP_PUNT; kthread_associate_blkcg(blkcg_css); @@ -432,6 +457,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, bytes_left = compressed_len; for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { int submit = 0; + int len; page = compressed_pages[pg_index]; page->mapping = inode->vfs_inode.i_mapping; @@ -439,9 +465,13 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, 0); + if (pg_index == 0 && use_append) + len = bio_add_zone_append_page(bio, page, PAGE_SIZE, 0); + else + len = bio_add_page(bio, page, PAGE_SIZE, 0); + page->mapping = NULL; - if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < - PAGE_SIZE) { + if (submit || len < PAGE_SIZE) { /* * inc the count before we submit the bio so * we know the end IO handler won't happen before @@ -465,11 +495,15 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, } bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = REQ_OP_WRITE | write_flags; + bio->bi_opf = bio_op | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; if (blkcg_css) bio->bi_opf |= REQ_CGROUP_PUNT; + /* + * Use bio_add_page() to ensure the bio has at least one + * page. + */ bio_add_page(bio, page, PAGE_SIZE, 0); } if (bytes_left < PAGE_SIZE) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f83fd3cbf243..9fb76829a281 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3127,7 +3127,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_snapshot(struct btrfs_root *root); +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7a28314189b4..f1d15b68994a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1340,12 +1340,16 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) { u64 bytes; + struct btrfs_device *device = stripe->dev; - if (!stripe->dev->bdev) { + if (!device->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + continue; + ret = do_discard_extent(stripe, &bytes); if (!ret) { discarded_bytes += bytes; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 074a78a202b8..dee2dafbc872 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3753,7 +3753,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, /* Note that em_end from extent_map_end() is exclusive */ iosize = min(em_end, end + 1) - cur; - if (btrfs_use_zone_append(inode, em)) + if (btrfs_use_zone_append(inode, em->block_start)) opf = REQ_OP_ZONE_APPEND; free_extent_map(em); @@ -5196,7 +5196,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int ret = 0; - u64 off = start; + u64 off; u64 max = start + len; u32 flags = 0; u32 found_type; @@ -5231,6 +5231,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto out_free_ulist; } + /* + * We can't initialize that to 'start' as this could miss extents due + * to extent item merging + */ + off = 0; start = round_down(start, btrfs_inode_sectorsize(inode)); len = round_up(max, btrfs_inode_sectorsize(inode)) - start; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 864c08d08a35..3b10d98b4ebb 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2067,6 +2067,30 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) return ret; } +static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) +{ + struct btrfs_inode *inode = BTRFS_I(ctx->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (btrfs_inode_in_log(inode, fs_info->generation) && + list_empty(&ctx->ordered_extents)) + return true; + + /* + * If we are doing a fast fsync we can not bail out if the inode's + * last_trans is <= then the last committed transaction, because we only + * update the last_trans of the inode during ordered extent completion, + * and for a fast fsync we don't wait for that, we only wait for the + * writeback to complete. + */ + if (inode->last_trans <= fs_info->last_trans_committed && + (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || + list_empty(&ctx->ordered_extents))) + return true; + + return false; +} + /* * fsync call for both files and directories. This logs the inode into * the tree log instead of forcing full commits whenever possible. @@ -2185,17 +2209,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); - /* - * If we are doing a fast fsync we can not bail out if the inode's - * last_trans is <= then the last committed transaction, because we only - * update the last_trans of the inode during ordered extent completion, - * and for a fast fsync we don't wait for that, we only wait for the - * writeback to complete. - */ smp_mb(); - if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || - (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed && - (full_sync || list_empty(&ctx.ordered_extents)))) { + if (skip_inode_logging(&ctx)) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e54466fc101f..4806295116d8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3949,7 +3949,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *block_group; struct rb_node *node; - int ret; + int ret = 0; btrfs_info(fs_info, "cleaning free space cache v1"); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4af336008b12..33f14573f2ec 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3241,6 +3241,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); run_delayed_iput_locked(fs_info, inode); + cond_resched_lock(&fs_info->delayed_iput_lock); } spin_unlock(&fs_info->delayed_iput_lock); } @@ -7785,7 +7786,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->bdev = fs_info->fs_devices->latest_bdev; iomap->length = len; - if (write && btrfs_use_zone_append(BTRFS_I(inode), em)) + if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) iomap->flags |= IOMAP_F_ZONE_APPEND; free_extent_map(em); @@ -9678,7 +9679,7 @@ out: return ret; } -int btrfs_start_delalloc_snapshot(struct btrfs_root *root) +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) { struct writeback_control wbc = { .nr_to_write = LONG_MAX, @@ -9691,7 +9692,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - return start_delalloc_inodes(root, &wbc, true, false); + return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); } int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ee1dbabb5d3c..5dc2fd843ae3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -259,6 +259,8 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, if (!fa->flags_valid) { /* 1 item for the inode */ trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); goto update_flags; } @@ -907,7 +909,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent, */ btrfs_drew_read_lock(&root->snapshot_lock); - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, false); if (ret) goto out; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 07b0b4218791..6c413bb451a3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -984,7 +984,7 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, if (pre) ret = clone_ordered_extent(ordered, 0, pre); - if (post) + if (ret == 0 && post) ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, post); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 2319c923c9e6..3ded812f522c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3545,11 +3545,15 @@ static int try_flush_qgroup(struct btrfs_root *root) struct btrfs_trans_handle *trans; int ret; - /* Can't hold an open transaction or we run the risk of deadlocking */ - ASSERT(current->journal_info == NULL || - current->journal_info == BTRFS_SEND_TRANS_STUB); - if (WARN_ON(current->journal_info && - current->journal_info != BTRFS_SEND_TRANS_STUB)) + /* + * Can't hold an open transaction or we run the risk of deadlocking, + * and can't either be under the context of a send operation (where + * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that + * would result in a crash when starting a transaction and does not + * make sense either (send is a read-only operation). + */ + ASSERT(current->journal_info == NULL); + if (WARN_ON(current->journal_info)) return 0; /* @@ -3562,7 +3566,7 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; } - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 3928ecc40d7b..d434dc78dadf 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -282,6 +282,11 @@ copy_inline_extent: out: if (!ret && !trans) { /* + * Release path before starting a new transaction so we don't + * hold locks that would confuse lockdep. + */ + btrfs_release_path(path); + /* * No transaction here means we copied the inline extent into a * page of the destination inode. * diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 55741adf9071..bd69db72acc5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7170,7 +7170,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx) int i; if (root) { - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); @@ -7178,7 +7178,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx) for (i = 0; i < sctx->clone_roots_cnt; i++) { root = sctx->clone_roots[i].root; - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f67721d82e5d..326be57f2828 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1858,8 +1858,6 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); } else if (ret == -EEXIST) { ret = 0; - } else { - BUG(); /* Logic Error */ } iput(inode); @@ -6061,7 +6059,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * (since logging them is pointless, a link count of 0 means they * will never be accessible). */ - if (btrfs_inode_in_log(inode, trans->transid) || + if ((btrfs_inode_in_log(inode, trans->transid) && + list_empty(&ctx->ordered_extents)) || inode->vfs_inode.i_nlink == 0) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; @@ -6462,6 +6461,24 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, (!old_dir || old_dir->logged_trans < trans->transid)) return; + /* + * If we are doing a rename (old_dir is not NULL) from a directory that + * was previously logged, make sure the next log attempt on the directory + * is not skipped and logs the inode again. This is because the log may + * not currently be authoritative for a range including the old + * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make + * sure after a log replay we do not end up with both the new and old + * dentries around (in case the inode is a directory we would have a + * directory with two hard links and 2 inode references for different + * parents). The next log attempt of old_dir will happen at + * btrfs_log_all_parents(), called through btrfs_log_inode_parent() + * below, because we have previously set inode->last_unlink_trans to the + * current transaction ID, either here or at btrfs_record_unlink_dir() in + * case inode is a directory. + */ + if (old_dir) + old_dir->logged_trans = 0; + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); ctx.logging_new_name = true; /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9a1ead0c4a31..47d27059d064 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1459,7 +1459,7 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, /* Given hole range was invalid (outside of device) */ if (ret == -ERANGE) { *hole_start += *hole_size; - *hole_size = false; + *hole_size = 0; return true; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 70b23a0d03b1..1bb8ee97aae0 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1126,6 +1126,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + ret = -EIO; + goto out; + } + switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: @@ -1273,7 +1278,7 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) spin_unlock(&trans->releasing_ebs_lock); } -bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) +bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_group *cache; @@ -1288,7 +1293,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) if (!is_data_inode(&inode->vfs_inode)) return false; - cache = btrfs_lookup_block_group(fs_info, em->block_start); + cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); if (!cache) return false; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 5e41a74a9cb2..e55d32595c2c 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -53,7 +53,7 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); -bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em); +bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start); void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, struct bio *bio); void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); @@ -152,8 +152,7 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } -static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, - struct extent_map *em) +static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) { return false; } diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 4a97fe12006b..37fc7d6ac457 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -72,15 +72,28 @@ struct smb3_key_debug_info { } __packed; /* - * Dump full key (32 byte encrypt/decrypt keys instead of 16 bytes) - * is needed if GCM256 (stronger encryption) negotiated + * Dump variable-sized keys */ struct smb3_full_key_debug_info { - __u64 Suid; + /* INPUT: size of userspace buffer */ + __u32 in_size; + + /* + * INPUT: 0 for current user, otherwise session to dump + * OUTPUT: session id that was dumped + */ + __u64 session_id; __u16 cipher_type; - __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */ - __u8 smb3encryptionkey[32]; /* SMB3_ENC_DEC_KEY_SIZE */ - __u8 smb3decryptionkey[32]; /* SMB3_ENC_DEC_KEY_SIZE */ + __u8 session_key_length; + __u8 server_in_key_length; + __u8 server_out_key_length; + __u8 data[]; + /* + * return this struct with the keys appended at the end: + * __u8 session_key[session_key_length]; + * __u8 server_in_key[server_in_key_length]; + * __u8 server_out_key[server_out_key_length]; + */ } __packed; struct smb3_notify { diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index d7ea9c5fe0f8..2ffcb29d5c8f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -133,7 +133,7 @@ struct workqueue_struct *cifsiod_wq; struct workqueue_struct *decrypt_wq; struct workqueue_struct *fileinfo_put_wq; struct workqueue_struct *cifsoplockd_wq; -struct workqueue_struct *deferredclose_wq; +struct workqueue_struct *deferredclose_wq; __u32 cifs_lock_secret; /* diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d88b4b523dcc..8488d7024462 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1257,8 +1257,7 @@ struct cifsFileInfo { struct work_struct oplock_break; /* work for oplock breaks */ struct work_struct put; /* work for the final part of _put */ struct delayed_work deferred; - bool oplock_break_received; /* Flag to indicate oplock break */ - bool deferred_scheduled; + bool deferred_close_scheduled; /* Flag to indicate close is scheduled */ }; struct cifs_io_parms { @@ -1418,6 +1417,7 @@ struct cifsInodeInfo { struct inode vfs_inode; struct list_head deferred_closes; /* list of deferred closes */ spinlock_t deferred_lock; /* protection on deferred list */ + bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */ }; static inline struct cifsInodeInfo * diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index b53a87db282f..554d64fe171e 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -148,7 +148,8 @@ #define SMB3_SIGN_KEY_SIZE (16) /* - * Size of the smb3 encryption/decryption keys + * Size of the smb3 encryption/decryption key storage. + * This size is big enough to store any cipher key types. */ #define SMB3_ENC_DEC_KEY_SIZE (32) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6caad100c3f3..379a427f3c2f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -323,8 +323,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->dentry = dget(dentry); cfile->f_flags = file->f_flags; cfile->invalidHandle = false; - cfile->oplock_break_received = false; - cfile->deferred_scheduled = false; + cfile->deferred_close_scheduled = false; cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); INIT_WORK(&cfile->put, cifsFileInfo_put_work); @@ -574,21 +573,18 @@ int cifs_open(struct inode *inode, struct file *file) file->f_op = &cifs_file_direct_ops; } - spin_lock(&CIFS_I(inode)->deferred_lock); /* Get the cached handle as SMB2 close is deferred */ rc = cifs_get_readable_path(tcon, full_path, &cfile); if (rc == 0) { if (file->f_flags == cfile->f_flags) { file->private_data = cfile; + spin_lock(&CIFS_I(inode)->deferred_lock); cifs_del_deferred_close(cfile); spin_unlock(&CIFS_I(inode)->deferred_lock); goto out; } else { - spin_unlock(&CIFS_I(inode)->deferred_lock); _cifsFileInfo_put(cfile, true, false); } - } else { - spin_unlock(&CIFS_I(inode)->deferred_lock); } if (server->oplocks) @@ -878,12 +874,8 @@ void smb2_deferred_work_close(struct work_struct *work) struct cifsFileInfo, deferred.work); spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); - if (!cfile->deferred_scheduled) { - spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); - return; - } cifs_del_deferred_close(cfile); - cfile->deferred_scheduled = false; + cfile->deferred_close_scheduled = false; spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); _cifsFileInfo_put(cfile, true, false); } @@ -900,19 +892,26 @@ int cifs_close(struct inode *inode, struct file *file) file->private_data = NULL; dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); if ((cinode->oplock == CIFS_CACHE_RHW_FLG) && + cinode->lease_granted && dclose) { if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) inode->i_ctime = inode->i_mtime = current_time(inode); spin_lock(&cinode->deferred_lock); cifs_add_deferred_close(cfile, dclose); - if (cfile->deferred_scheduled) { - mod_delayed_work(deferredclose_wq, - &cfile->deferred, cifs_sb->ctx->acregmax); + if (cfile->deferred_close_scheduled && + delayed_work_pending(&cfile->deferred)) { + /* + * If there is no pending work, mod_delayed_work queues new work. + * So, Increase the ref count to avoid use-after-free. + */ + if (!mod_delayed_work(deferredclose_wq, + &cfile->deferred, cifs_sb->ctx->acregmax)) + cifsFileInfo_get(cfile); } else { /* Deferred close for files */ queue_delayed_work(deferredclose_wq, &cfile->deferred, cifs_sb->ctx->acregmax); - cfile->deferred_scheduled = true; + cfile->deferred_close_scheduled = true; spin_unlock(&cinode->deferred_lock); return 0; } @@ -2020,8 +2019,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, if (fsuid_only && !uid_eq(open_file->uid, current_fsuid())) continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) { - if ((!open_file->invalidHandle) && - (!open_file->oplock_break_received)) { + if ((!open_file->invalidHandle)) { /* found a good file */ /* lock it so it will not be closed on us */ cifsFileInfo_get(open_file); @@ -4874,14 +4872,20 @@ oplock_break_ack: } /* * When oplock break is received and there are no active - * file handles but cached, then set the flag oplock_break_received. + * file handles but cached, then schedule deferred close immediately. * So, new open will not use cached handle. */ spin_lock(&CIFS_I(inode)->deferred_lock); is_deferred = cifs_is_deferred_close(cfile, &dclose); - if (is_deferred && cfile->deferred_scheduled) { - cfile->oplock_break_received = true; - mod_delayed_work(deferredclose_wq, &cfile->deferred, 0); + if (is_deferred && + cfile->deferred_close_scheduled && + delayed_work_pending(&cfile->deferred)) { + /* + * If there is no pending work, mod_delayed_work queues new work. + * So, Increase the ref count to avoid use-after-free. + */ + if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) + cifsFileInfo_get(cfile); } spin_unlock(&CIFS_I(inode)->deferred_lock); _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false); diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 5d21cd905315..92d4ab029c91 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -1145,7 +1145,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, /* if iocharset not set then load_nls_default * is used by caller */ - cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset); + cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset); break; case Opt_netbiosname: memset(ctx->source_rfc1001_name, 0x20, diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 28ec8d7c521a..d67d281ab863 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -33,6 +33,7 @@ #include "cifsfs.h" #include "cifs_ioctl.h" #include "smb2proto.h" +#include "smb2glob.h" #include <linux/btrfs.h> static long cifs_ioctl_query_info(unsigned int xid, struct file *filep, @@ -214,48 +215,112 @@ static int cifs_shutdown(struct super_block *sb, unsigned long arg) return 0; } -static int cifs_dump_full_key(struct cifs_tcon *tcon, unsigned long arg) +static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug_info __user *in) { - struct smb3_full_key_debug_info pfull_key_inf; - __u64 suid; - struct list_head *tmp; + struct smb3_full_key_debug_info out; struct cifs_ses *ses; + int rc = 0; bool found = false; + u8 __user *end; - if (!smb3_encryption_required(tcon)) - return -EOPNOTSUPP; + if (!smb3_encryption_required(tcon)) { + rc = -EOPNOTSUPP; + goto out; + } + + /* copy user input into our output buffer */ + if (copy_from_user(&out, in, sizeof(out))) { + rc = -EINVAL; + goto out; + } + + if (!out.session_id) { + /* if ses id is 0, use current user session */ + ses = tcon->ses; + } else { + /* otherwise if a session id is given, look for it in all our sessions */ + struct cifs_ses *ses_it = NULL; + struct TCP_Server_Info *server_it = NULL; - ses = tcon->ses; /* default to user id for current user */ - if (get_user(suid, (__u64 __user *)arg)) - suid = 0; - if (suid) { - /* search to see if there is a session with a matching SMB UID */ spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &tcon->ses->server->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); - if (ses->Suid == suid) { - found = true; - break; + list_for_each_entry(server_it, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses_it, &server_it->smb_ses_list, smb_ses_list) { + if (ses_it->Suid == out.session_id) { + ses = ses_it; + /* + * since we are using the session outside the crit + * section, we need to make sure it won't be released + * so increment its refcount + */ + ses->ses_count++; + found = true; + goto search_end; + } } } +search_end: spin_unlock(&cifs_tcp_ses_lock); - if (found == false) - return -EINVAL; - } /* else uses default user's SMB UID (ie current user) */ - - pfull_key_inf.cipher_type = le16_to_cpu(ses->server->cipher_type); - pfull_key_inf.Suid = ses->Suid; - memcpy(pfull_key_inf.auth_key, ses->auth_key.response, - 16 /* SMB2_NTLMV2_SESSKEY_SIZE */); - memcpy(pfull_key_inf.smb3decryptionkey, ses->smb3decryptionkey, - 32 /* SMB3_ENC_DEC_KEY_SIZE */); - memcpy(pfull_key_inf.smb3encryptionkey, - ses->smb3encryptionkey, 32 /* SMB3_ENC_DEC_KEY_SIZE */); - if (copy_to_user((void __user *)arg, &pfull_key_inf, - sizeof(struct smb3_full_key_debug_info))) - return -EFAULT; + if (!found) { + rc = -ENOENT; + goto out; + } + } - return 0; + switch (ses->server->cipher_type) { + case SMB2_ENCRYPTION_AES128_CCM: + case SMB2_ENCRYPTION_AES128_GCM: + out.session_key_length = CIFS_SESS_KEY_SIZE; + out.server_in_key_length = out.server_out_key_length = SMB3_GCM128_CRYPTKEY_SIZE; + break; + case SMB2_ENCRYPTION_AES256_CCM: + case SMB2_ENCRYPTION_AES256_GCM: + out.session_key_length = CIFS_SESS_KEY_SIZE; + out.server_in_key_length = out.server_out_key_length = SMB3_GCM256_CRYPTKEY_SIZE; + break; + default: + rc = -EOPNOTSUPP; + goto out; + } + + /* check if user buffer is big enough to store all the keys */ + if (out.in_size < sizeof(out) + out.session_key_length + out.server_in_key_length + + out.server_out_key_length) { + rc = -ENOBUFS; + goto out; + } + + out.session_id = ses->Suid; + out.cipher_type = le16_to_cpu(ses->server->cipher_type); + + /* overwrite user input with our output */ + if (copy_to_user(in, &out, sizeof(out))) { + rc = -EINVAL; + goto out; + } + + /* append all the keys at the end of the user buffer */ + end = in->data; + if (copy_to_user(end, ses->auth_key.response, out.session_key_length)) { + rc = -EINVAL; + goto out; + } + end += out.session_key_length; + + if (copy_to_user(end, ses->smb3encryptionkey, out.server_in_key_length)) { + rc = -EINVAL; + goto out; + } + end += out.server_in_key_length; + + if (copy_to_user(end, ses->smb3decryptionkey, out.server_out_key_length)) { + rc = -EINVAL; + goto out; + } + +out: + if (found) + cifs_put_smb_ses(ses); + return rc; } long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) @@ -371,6 +436,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EOPNOTSUPP; break; case CIFS_DUMP_KEY: + /* + * Dump encryption keys. This is an old ioctl that only + * handles AES-128-{CCM,GCM}. + */ if (pSMBFile == NULL) break; if (!capable(CAP_SYS_ADMIN)) { @@ -398,11 +467,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) else rc = 0; break; - /* - * Dump full key (32 bytes instead of 16 bytes) is - * needed if GCM256 (stronger encryption) negotiated - */ case CIFS_DUMP_FULL_KEY: + /* + * Dump encryption keys (handles any key sizes) + */ if (pSMBFile == NULL) break; if (!capable(CAP_SYS_ADMIN)) { @@ -410,8 +478,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) break; } tcon = tlink_tcon(pSMBFile->tlink); - rc = cifs_dump_full_key(tcon, arg); - + rc = cifs_dump_full_key(tcon, (void __user *)arg); break; case CIFS_IOC_NOTIFY: if (!S_ISDIR(inode->i_mode)) { diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 524dbdfb7184..7207a63819cb 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -672,6 +672,11 @@ cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } +/* + * Critical section which runs after acquiring deferred_lock. + * As there is no reference count on cifs_deferred_close, pdclose + * should not be used outside deferred_lock. + */ bool cifs_is_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close **pdclose) { @@ -688,6 +693,9 @@ cifs_is_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close ** return false; } +/* + * Critical section which runs after acquiring deferred_lock. + */ void cifs_add_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close *dclose) { @@ -707,6 +715,9 @@ cifs_add_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close * list_add_tail(&dclose->dlist, &CIFS_I(d_inode(cfile->dentry))->deferred_closes); } +/* + * Critical section which runs after acquiring deferred_lock. + */ void cifs_del_deferred_close(struct cifsFileInfo *cfile) { @@ -738,15 +749,19 @@ void cifs_close_all_deferred_files(struct cifs_tcon *tcon) { struct cifsFileInfo *cfile; - struct cifsInodeInfo *cinode; struct list_head *tmp; spin_lock(&tcon->open_file_lock); list_for_each(tmp, &tcon->openFileList) { cfile = list_entry(tmp, struct cifsFileInfo, tlist); - cinode = CIFS_I(d_inode(cfile->dentry)); - if (delayed_work_pending(&cfile->deferred)) - mod_delayed_work(deferredclose_wq, &cfile->deferred, 0); + if (delayed_work_pending(&cfile->deferred)) { + /* + * If there is no pending work, mod_delayed_work queues new work. + * So, Increase the ref count to avoid use-after-free. + */ + if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) + cifsFileInfo_get(cfile); + } } spin_unlock(&tcon->open_file_lock); } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index dd0eb665b680..21ef51d338e0 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1861,6 +1861,8 @@ smb2_copychunk_range(const unsigned int xid, cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk)); /* Request server copy to target from src identified by key */ + kfree(retbuf); + retbuf = NULL; rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, true /* is_fsctl */, (char *)pcchunk, @@ -3981,6 +3983,7 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { oplock &= 0xFF; + cinode->lease_granted = false; if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; if (oplock == SMB2_OPLOCK_LEVEL_BATCH) { @@ -4007,6 +4010,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int new_oplock = 0; oplock &= 0xFF; + cinode->lease_granted = true; if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index a8bf43184773..c205f93e0a10 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -958,6 +958,13 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* Internal types */ server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES; + /* + * SMB3.0 supports only 1 cipher and doesn't have a encryption neg context + * Set the cipher type manually. + */ + if (server->dialect == SMB30_PROT_ID && (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) + server->cipher_type = SMB2_ENCRYPTION_AES128_CCM; + security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, (struct smb2_sync_hdr *)rsp); /* @@ -3900,10 +3907,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * Related requests use info from previous read request * in chain. */ - shdr->SessionId = 0xFFFFFFFF; + shdr->SessionId = 0xFFFFFFFFFFFFFFFF; shdr->TreeId = 0xFFFFFFFF; - req->PersistentFileId = 0xFFFFFFFF; - req->VolatileFileId = 0xFFFFFFFF; + req->PersistentFileId = 0xFFFFFFFFFFFFFFFF; + req->VolatileFileId = 0xFFFFFFFFFFFFFFFF; } } if (remaining_bytes > io_parms->length) diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index d6df908dccad..dafcb6ab050d 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -12,6 +12,11 @@ #include <linux/tracepoint.h> +/* + * Please use this 3-part article as a reference for writing new tracepoints: + * https://lwn.net/Articles/379903/ + */ + /* For logging errors in read or write */ DECLARE_EVENT_CLASS(smb3_rw_err_class, TP_PROTO(unsigned int xid, @@ -529,16 +534,16 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class, TP_ARGS(xid, func_name, rc), TP_STRUCT__entry( __field(unsigned int, xid) - __field(const char *, func_name) + __string(func_name, func_name) __field(int, rc) ), TP_fast_assign( __entry->xid = xid; - __entry->func_name = func_name; + __assign_str(func_name, func_name); __entry->rc = rc; ), TP_printk("\t%s: xid=%u rc=%d", - __entry->func_name, __entry->xid, __entry->rc) + __get_str(func_name), __entry->xid, __entry->rc) ) #define DEFINE_SMB3_EXIT_ERR_EVENT(name) \ @@ -583,14 +588,14 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class, TP_ARGS(xid, func_name), TP_STRUCT__entry( __field(unsigned int, xid) - __field(const char *, func_name) + __string(func_name, func_name) ), TP_fast_assign( __entry->xid = xid; - __entry->func_name = func_name; + __assign_str(func_name, func_name); ), TP_printk("\t%s: xid=%u", - __entry->func_name, __entry->xid) + __get_str(func_name), __entry->xid) ) #define DEFINE_SMB3_ENTER_EXIT_EVENT(name) \ @@ -857,16 +862,16 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_STRUCT__entry( __field(__u64, currmid) __field(__u64, conn_id) - __field(char *, hostname) + __string(hostname, hostname) ), TP_fast_assign( __entry->currmid = currmid; __entry->conn_id = conn_id; - __entry->hostname = hostname; + __assign_str(hostname, hostname); ), TP_printk("conn_id=0x%llx server=%s current_mid=%llu", __entry->conn_id, - __entry->hostname, + __get_str(hostname), __entry->currmid) ) @@ -891,7 +896,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_STRUCT__entry( __field(__u64, currmid) __field(__u64, conn_id) - __field(char *, hostname) + __string(hostname, hostname) __field(int, credits) __field(int, credits_to_add) __field(int, in_flight) @@ -899,7 +904,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_fast_assign( __entry->currmid = currmid; __entry->conn_id = conn_id; - __entry->hostname = hostname; + __assign_str(hostname, hostname); __entry->credits = credits; __entry->credits_to_add = credits_to_add; __entry->in_flight = in_flight; @@ -907,7 +912,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_printk("conn_id=0x%llx server=%s current_mid=%llu " "credits=%d credit_change=%d in_flight=%d", __entry->conn_id, - __entry->hostname, + __get_str(hostname), __entry->currmid, __entry->credits, __entry->credits_to_add, @@ -144,6 +144,16 @@ struct wait_exceptional_entry_queue { struct exceptional_entry_key key; }; +/** + * enum dax_wake_mode: waitqueue wakeup behaviour + * @WAKE_ALL: wake all waiters in the waitqueue + * @WAKE_NEXT: wake only the first waiter in the waitqueue + */ +enum dax_wake_mode { + WAKE_ALL, + WAKE_NEXT, +}; + static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, void *entry, struct exceptional_entry_key *key) { @@ -182,7 +192,8 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, * The important information it's conveying is whether the entry at * this index used to be a PMD entry. */ -static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) +static void dax_wake_entry(struct xa_state *xas, void *entry, + enum dax_wake_mode mode) { struct exceptional_entry_key key; wait_queue_head_t *wq; @@ -196,7 +207,7 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) * must be in the waitqueue and the following check will see them. */ if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); + __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key); } /* @@ -264,11 +275,11 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry) finish_wait(wq, &ewait.wait); } -static void put_unlocked_entry(struct xa_state *xas, void *entry) +static void put_unlocked_entry(struct xa_state *xas, void *entry, + enum dax_wake_mode mode) { - /* If we were the only waiter woken, wake the next one */ if (entry && !dax_is_conflict(entry)) - dax_wake_entry(xas, entry, false); + dax_wake_entry(xas, entry, mode); } /* @@ -286,7 +297,7 @@ static void dax_unlock_entry(struct xa_state *xas, void *entry) old = xas_store(xas, entry); xas_unlock_irq(xas); BUG_ON(!dax_is_locked(old)); - dax_wake_entry(xas, entry, false); + dax_wake_entry(xas, entry, WAKE_NEXT); } /* @@ -524,7 +535,7 @@ retry: dax_disassociate_entry(entry, mapping, false); xas_store(xas, NULL); /* undo the PMD join */ - dax_wake_entry(xas, entry, true); + dax_wake_entry(xas, entry, WAKE_ALL); mapping->nrpages -= PG_PMD_NR; entry = NULL; xas_set(xas, index); @@ -622,7 +633,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, entry = get_unlocked_entry(&xas, 0); if (entry) page = dax_busy_page(entry); - put_unlocked_entry(&xas, entry); + put_unlocked_entry(&xas, entry, WAKE_NEXT); if (page) break; if (++scanned % XA_CHECK_SCHED) @@ -664,7 +675,7 @@ static int __dax_invalidate_entry(struct address_space *mapping, mapping->nrpages -= 1UL << dax_entry_order(entry); ret = 1; out: - put_unlocked_entry(&xas, entry); + put_unlocked_entry(&xas, entry, WAKE_ALL); xas_unlock_irq(&xas); return ret; } @@ -937,13 +948,13 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, xas_lock_irq(xas); xas_store(xas, entry); xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); - dax_wake_entry(xas, entry, false); + dax_wake_entry(xas, entry, WAKE_NEXT); trace_dax_writeback_one(mapping->host, index, count); return ret; put_unlocked: - put_unlocked_entry(xas, entry); + put_unlocked_entry(xas, entry, WAKE_NEXT); return ret; } @@ -1684,7 +1695,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) /* Did we race with someone splitting entry or so? */ if (!entry || dax_is_conflict(entry) || (order == 0 && !dax_is_pte_entry(entry))) { - put_unlocked_entry(&xas, entry); + put_unlocked_entry(&xas, entry, WAKE_NEXT); xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 1d252164d97b..8129a430d789 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -45,10 +45,13 @@ static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; static int debugfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *ia) { - int ret = security_locked_down(LOCKDOWN_DEBUGFS); + int ret; - if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) - return ret; + if (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) { + ret = security_locked_down(LOCKDOWN_DEBUGFS); + if (ret) + return ret; + } return simple_setattr(&init_user_ns, dentry, ia); } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 345f8061e3b4..e3f5d7f3c8a0 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -296,10 +296,6 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, struct extent_crypt_result ecr; int rc = 0; - if (!crypt_stat || !crypt_stat->tfm - || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) - return -EINVAL; - if (unlikely(ecryptfs_verbosity > 0)) { ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n", crypt_stat->key_size); diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index e62d813756f2..efaf32596b97 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -450,14 +450,31 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, lcn = m->lcn + 1; if (m->compressedlcs) goto out; - if (lcn == initial_lcn) - goto err_bonus_cblkcnt; err = z_erofs_load_cluster_from_disk(m, lcn); if (err) return err; + /* + * If the 1st NONHEAD lcluster has already been handled initially w/o + * valid compressedlcs, which means at least it mustn't be CBLKCNT, or + * an internal implemenatation error is detected. + * + * The following code can also handle it properly anyway, but let's + * BUG_ON in the debugging mode only for developers to notice that. + */ + DBG_BUGON(lcn == initial_lcn && + m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD); + switch (m->type) { + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + /* + * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type + * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. + */ + m->compressedlcs = 1; + break; case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: if (m->delta[0] != 1) goto err_bonus_cblkcnt; diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 53b13787eb2c..925a5ca3744a 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -117,19 +117,6 @@ static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) f2fs_drop_rpages(cc, len, true); } -static void f2fs_put_rpages_mapping(struct address_space *mapping, - pgoff_t start, int len) -{ - int i; - - for (i = 0; i < len; i++) { - struct page *page = find_get_page(mapping, start + i); - - put_page(page); - put_page(page); - } -} - static void f2fs_put_rpages_wbc(struct compress_ctx *cc, struct writeback_control *wbc, bool redirty, int unlock) { @@ -158,13 +145,14 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc) return cc->rpages ? 0 : -ENOMEM; } -void f2fs_destroy_compress_ctx(struct compress_ctx *cc) +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) { page_array_free(cc->inode, cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; - cc->cluster_idx = NULL_CLUSTER; + if (!reuse) + cc->cluster_idx = NULL_CLUSTER; } void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page) @@ -1036,7 +1024,7 @@ retry: } if (PageUptodate(page)) - unlock_page(page); + f2fs_put_page(page, 1); else f2fs_compress_ctx_add_page(cc, page); } @@ -1046,33 +1034,35 @@ retry: ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, &last_block_in_bio, false, true); - f2fs_destroy_compress_ctx(cc); + f2fs_put_rpages(cc); + f2fs_destroy_compress_ctx(cc, true); if (ret) - goto release_pages; + goto out; if (bio) f2fs_submit_bio(sbi, bio, DATA); ret = f2fs_init_compress_ctx(cc); if (ret) - goto release_pages; + goto out; } for (i = 0; i < cc->cluster_size; i++) { f2fs_bug_on(sbi, cc->rpages[i]); page = find_lock_page(mapping, start_idx + i); - f2fs_bug_on(sbi, !page); + if (!page) { + /* page can be truncated */ + goto release_and_retry; + } f2fs_wait_on_page_writeback(page, DATA, true, true); - f2fs_compress_ctx_add_page(cc, page); - f2fs_put_page(page, 0); if (!PageUptodate(page)) { +release_and_retry: + f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); - f2fs_put_rpages_mapping(mapping, start_idx, - cc->cluster_size); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); goto retry; } } @@ -1103,10 +1093,10 @@ retry: } unlock_pages: + f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i); -release_pages: - f2fs_put_rpages_mapping(mapping, start_idx, i); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); +out: return ret; } @@ -1141,7 +1131,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, set_cluster_dirty(&cc); f2fs_put_rpages_wbc(&cc, NULL, false, 1); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); return first_index; } @@ -1361,7 +1351,7 @@ unlock_continue: f2fs_put_rpages(cc); page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, false); return 0; out_destroy_crypt: @@ -1372,7 +1362,8 @@ out_destroy_crypt: for (i = 0; i < cc->nr_cpages; i++) { if (!cc->cpages[i]) continue; - f2fs_put_page(cc->cpages[i], 1); + f2fs_compress_free_page(cc->cpages[i]); + cc->cpages[i] = NULL; } out_put_cic: kmem_cache_free(cic_entry_slab, cic); @@ -1522,7 +1513,7 @@ write: err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); f2fs_put_rpages_wbc(cc, wbc, false, 0); destroy_out: - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, false); return err; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 96f1a354f89f..009a09fb9d88 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2287,7 +2287,7 @@ static int f2fs_mpage_readpages(struct inode *inode, max_nr_pages, &last_block_in_bio, rac != NULL, false); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); if (ret) goto set_error_page; } @@ -2332,7 +2332,7 @@ next_page: max_nr_pages, &last_block_in_bio, rac != NULL, false); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); } } #endif @@ -3033,7 +3033,7 @@ next: } } if (f2fs_compressed_file(inode)) - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); #endif if (retry) { index = 0; @@ -3801,6 +3801,7 @@ static int f2fs_is_file_aligned(struct inode *inode) block_t pblock; unsigned long nr_pblocks; unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + unsigned int not_aligned = 0; int ret = 0; cur_lblock = 0; @@ -3833,13 +3834,20 @@ static int f2fs_is_file_aligned(struct inode *inode) if ((pblock - main_blkaddr) & (blocks_per_sec - 1) || nr_pblocks & (blocks_per_sec - 1)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; + if (f2fs_is_pinned_file(inode)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + not_aligned++; } cur_lblock += nr_pblocks; } + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" + "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", + not_aligned); out: return ret; } @@ -3858,6 +3866,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, int nr_extents = 0; unsigned long nr_pblocks; unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + unsigned int not_aligned = 0; int ret = 0; /* @@ -3887,7 +3896,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, /* hole */ if (!(map.m_flags & F2FS_MAP_FLAGS)) { f2fs_err(sbi, "Swapfile has holes\n"); - ret = -ENOENT; + ret = -EINVAL; goto out; } @@ -3896,9 +3905,12 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) || nr_pblocks & (blocks_per_sec - 1)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; + if (f2fs_is_pinned_file(inode)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + not_aligned++; } if (cur_lblock + nr_pblocks >= sis->max) @@ -3927,6 +3939,11 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sis->max = cur_lblock; sis->pages = cur_lblock - 1; sis->highest_bit = cur_lblock - 1; + + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" + "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", + not_aligned); out: return ret; } @@ -4035,7 +4052,7 @@ out: return ret; bad_bmap: f2fs_err(sbi, "Swapfile has holes\n"); - return -ENOENT; + return -EINVAL; } static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 044878866ca3..c83d90125ebd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3956,7 +3956,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); void f2fs_put_page_dic(struct page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); -void f2fs_destroy_compress_ctx(struct compress_ctx *cc); +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 44a4650aea7b..ceb575f99048 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1817,7 +1817,8 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) struct f2fs_inode_info *fi = F2FS_I(inode); u32 masked_flags = fi->i_flags & mask; - f2fs_bug_on(F2FS_I_SB(inode), (iflags & ~mask)); + /* mask can be shrunk by flags_valid selector */ + iflags &= mask; /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c605415840b5..51dc79fad4fe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3574,12 +3574,12 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) return err; drop_bio: - if (fio->bio) { + if (fio->bio && *(fio->bio)) { struct bio *bio = *(fio->bio); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); - fio->bio = NULL; + *(fio->bio) = NULL; } return err; } diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index a930ddd15681..7054a542689f 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -598,13 +598,15 @@ void hfsplus_file_truncate(struct inode *inode) res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); if (res) break; - hfs_brec_remove(&fd); - mutex_unlock(&fd.tree->tree_lock); start = hip->cached_start; + if (blk_cnt <= start) + hfs_brec_remove(&fd); + mutex_unlock(&fd.tree->tree_lock); hfsplus_free_extents(sb, hip->cached_extents, alloc_cnt - start, alloc_cnt - blk_cnt); hfsplus_dump_extent(hip->cached_extents); + mutex_lock(&fd.tree->tree_lock); if (blk_cnt > start) { hip->extent_state |= HFSPLUS_EXT_DIRTY; break; @@ -612,7 +614,6 @@ void hfsplus_file_truncate(struct inode *inode) alloc_cnt = start; hip->cached_start = hip->cached_blocks = 0; hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); - mutex_lock(&fd.tree->tree_lock); } hfs_find_exit(&fd); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a2a42335e8fd..55efd3dd04f6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -131,6 +131,7 @@ static void huge_pagevec_release(struct pagevec *pvec) static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); @@ -146,6 +147,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; vma->vm_ops = &hugetlb_vm_ops; + ret = seal_check_future_write(info->seals, vma); + if (ret) + return ret; + /* * page based offset in vm_pgoff could be sufficiently large to * overflow a loff_t when converted to byte offset. This can @@ -524,7 +529,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * the subpool and global reserve usage count can need * to be adjusted. */ - VM_BUG_ON(PagePrivate(page)); + VM_BUG_ON(HPageRestoreReserve(page)); remove_huge_page(page); freed++; if (!truncate_op) { diff --git a/fs/io-wq.c b/fs/io-wq.c index 5361a9b4b47b..b3e8624a37d0 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -979,13 +979,16 @@ static bool io_task_work_match(struct callback_head *cb, void *data) return cwd->wqe->wq == data; } +void io_wq_exit_start(struct io_wq *wq) +{ + set_bit(IO_WQ_BIT_EXIT, &wq->state); +} + static void io_wq_exit_workers(struct io_wq *wq) { struct callback_head *cb; int node; - set_bit(IO_WQ_BIT_EXIT, &wq->state); - if (!wq->task) return; @@ -1003,13 +1006,16 @@ static void io_wq_exit_workers(struct io_wq *wq) struct io_wqe *wqe = wq->wqes[node]; io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL); - spin_lock_irq(&wq->hash->wait.lock); - list_del_init(&wq->wqes[node]->wait.entry); - spin_unlock_irq(&wq->hash->wait.lock); } rcu_read_unlock(); io_worker_ref_put(wq); wait_for_completion(&wq->worker_done); + + for_each_node(node) { + spin_lock_irq(&wq->hash->wait.lock); + list_del_init(&wq->wqes[node]->wait.entry); + spin_unlock_irq(&wq->hash->wait.lock); + } put_task_struct(wq->task); wq->task = NULL; } @@ -1020,8 +1026,6 @@ static void io_wq_destroy(struct io_wq *wq) cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - io_wq_exit_workers(wq); - for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; struct io_cb_cancel_data match = { @@ -1036,16 +1040,13 @@ static void io_wq_destroy(struct io_wq *wq) kfree(wq); } -void io_wq_put(struct io_wq *wq) -{ - if (refcount_dec_and_test(&wq->refs)) - io_wq_destroy(wq); -} - void io_wq_put_and_exit(struct io_wq *wq) { + WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state)); + io_wq_exit_workers(wq); - io_wq_put(wq); + if (refcount_dec_and_test(&wq->refs)) + io_wq_destroy(wq); } static bool io_wq_worker_affinity(struct io_worker *worker, void *data) diff --git a/fs/io-wq.h b/fs/io-wq.h index 0e6d310999e8..af2df0680ee2 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -122,7 +122,7 @@ struct io_wq_data { }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); -void io_wq_put(struct io_wq *wq); +void io_wq_exit_start(struct io_wq *wq); void io_wq_put_and_exit(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); diff --git a/fs/io_uring.c b/fs/io_uring.c index f46acbbeed57..903458afd56c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -100,6 +100,8 @@ #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) +#define IORING_MAX_REG_BUFFERS (1U << 14) + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ IOSQE_BUFFER_SELECT) @@ -4035,7 +4037,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #if defined(CONFIG_EPOLL) if (sqe->ioprio || sqe->buf_index) return -EINVAL; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; req->epoll.epfd = READ_ONCE(sqe->fd); @@ -4150,7 +4152,7 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->buf_index) return -EINVAL; @@ -5017,10 +5019,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, * Can't handle multishot for double wait for now, turn it * into one-shot mode. */ - if (!(req->poll.events & EPOLLONESHOT)) - req->poll.events |= EPOLLONESHOT; + if (!(poll_one->events & EPOLLONESHOT)) + poll_one->events |= EPOLLONESHOT; /* double add on the same waitqueue head, ignore */ - if (poll->head == head) + if (poll_one->head == head) return; poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { @@ -5827,8 +5829,6 @@ done: static int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL)) - return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->ioprio || sqe->rw_flags) @@ -6354,19 +6354,20 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ - if (prev && req_ref_inc_not_zero(prev)) + if (prev) { io_remove_next_linked(prev); - else - prev = NULL; + if (!req_ref_inc_not_zero(prev)) + prev = NULL; + } spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); io_put_req_deferred(prev, 1); + io_put_req_deferred(req, 1); } else { io_req_complete_post(req, -ETIME, 0); } - io_put_req_deferred(req, 1); return HRTIMER_NORESTART; } @@ -8390,7 +8391,7 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, if (ctx->user_bufs) return -EBUSY; - if (!nr_args || nr_args > UIO_MAXIOV) + if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; ret = io_rsrc_node_switch_start(ctx); if (ret) @@ -9034,14 +9035,19 @@ static void io_uring_del_task_file(unsigned long index) static void io_uring_clean_tctx(struct io_uring_task *tctx) { + struct io_wq *wq = tctx->io_wq; struct io_tctx_node *node; unsigned long index; xa_for_each(&tctx->xa, index, node) io_uring_del_task_file(index); - if (tctx->io_wq) { - io_wq_put_and_exit(tctx->io_wq); + if (wq) { + /* + * Must be after io_uring_del_task_file() (removes nodes under + * uring_lock) to avoid race with io_uring_try_cancel_iowq(). + */ tctx->io_wq = NULL; + io_wq_put_and_exit(wq); } } @@ -9077,6 +9083,9 @@ static void io_uring_cancel_sqpoll(struct io_sq_data *sqd) if (!current->io_uring) return; + if (tctx->io_wq) + io_wq_exit_start(tctx->io_wq); + WARN_ON_ONCE(!sqd || sqd->thread != current); atomic_inc(&tctx->in_idle); @@ -9111,6 +9120,9 @@ void __io_uring_cancel(struct files_struct *files) DEFINE_WAIT(wait); s64 inflight; + if (tctx->io_wq) + io_wq_exit_start(tctx->io_wq); + /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); do { diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f2cd2034a87b..9023717c5188 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -394,7 +394,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { struct inode *inode = rac->mapping->host; loff_t pos = readahead_pos(rac); - loff_t length = readahead_length(rac); + size_t length = readahead_length(rac); struct iomap_readpage_ctx ctx = { .rac = rac, }; @@ -402,7 +402,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) trace_iomap_readahead(inode, readahead_count(rac)); while (length > 0) { - loff_t ret = iomap_apply(inode, pos, length, 0, ops, + ssize_t ret = iomap_apply(inode, pos, length, 0, ops, &ctx, iomap_readahead_actor); if (ret <= 0) { WARN_ON_ONCE(ret == 0); diff --git a/fs/namespace.c b/fs/namespace.c index f63337828e1c..c3f1a78ba369 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3855,8 +3855,12 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) return -EINVAL; + /* Don't yet support filesystem mountable in user namespaces. */ + if (m->mnt_sb->s_user_ns != &init_user_ns) + return -EINVAL; + /* We're not controlling the superblock. */ - if (!ns_capable(m->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* Mount has already been visible in the filesystem hierarchy. */ diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig index 578112713703..b4db21022cb4 100644 --- a/fs/netfs/Kconfig +++ b/fs/netfs/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NETFS_SUPPORT - tristate "Support for network filesystem high-level I/O" + tristate help This option enables support for network filesystems, including helpers for high-level buffered I/O, abstracting out read diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 193841d03de0..725614625ed4 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -1068,7 +1068,7 @@ int netfs_write_begin(struct file *file, struct address_space *mapping, DEFINE_READAHEAD(ractl, file, NULL, mapping, index); retry: - page = grab_cache_page_write_begin(mapping, index, 0); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index d158a500c25c..d2103852475f 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -718,7 +718,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, if (unlikely(!p)) goto out_err; fl->fh_array[i]->size = be32_to_cpup(p++); - if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { + if (fl->fh_array[i]->size > NFS_MAXFHSIZE) { printk(KERN_ERR "NFS: Too big fh %d received %d\n", i, fl->fh_array[i]->size); goto out_err; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 93e60e921f92..bc0c698f3350 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -362,7 +362,7 @@ static const struct kernel_param_ops param_ops_nfs_timeout = { .set = param_set_nfs_timeout, .get = param_get_nfs_timeout, }; -#define param_check_nfs_timeout(name, p) __param_check(name, p, int); +#define param_check_nfs_timeout(name, p) __param_check(name, p, int) module_param(nfs_mountpoint_expiry_timeout, nfs_timeout, 0644); MODULE_PARM_DESC(nfs_mountpoint_expiry_timeout, diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 57b3821d975a..a1e5c6b85ded 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -211,7 +211,7 @@ static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) case SEEK_HOLE: case SEEK_DATA: ret = nfs42_proc_llseek(filep, offset, whence); - if (ret != -ENOTSUPP) + if (ret != -EOPNOTSUPP) return ret; fallthrough; default: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 87d04f2c9385..0cd965882232 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1706,7 +1706,7 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, rcu_read_unlock(); trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0); - if (!signal_pending(current)) { + if (!fatal_signal_pending(current)) { if (schedule_timeout(5*HZ) == 0) status = -EAGAIN; else @@ -3487,7 +3487,7 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, write_sequnlock(&state->seqlock); trace_nfs4_close_stateid_update_wait(state->inode, dst, 0); - if (signal_pending(current)) + if (fatal_signal_pending(current)) status = -EINTR; else if (schedule_timeout(5*HZ) != 0) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6c20b28d9d7c..cf9cc62ec48e 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1094,15 +1094,16 @@ nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *prev = NULL; unsigned int size; - if (mirror->pg_count != 0) { - prev = nfs_list_entry(mirror->pg_list.prev); - } else { + if (list_empty(&mirror->pg_list)) { if (desc->pg_ops->pg_init) desc->pg_ops->pg_init(desc, req); if (desc->pg_error < 0) return 0; mirror->pg_base = req->wb_pgbase; - } + mirror->pg_count = 0; + mirror->pg_recoalesce = 0; + } else + prev = nfs_list_entry(mirror->pg_list.prev); if (desc->pg_maxretrans && req->wb_nio > desc->pg_maxretrans) { if (NFS_SERVER(desc->pg_inode)->flags & NFS_MOUNT_SOFTERR) @@ -1127,18 +1128,13 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - if (!list_empty(&mirror->pg_list)) { int error = desc->pg_ops->pg_doio(desc); if (error < 0) desc->pg_error = error; - else + if (list_empty(&mirror->pg_list)) mirror->pg_bytes_written += mirror->pg_count; } - if (list_empty(&mirror->pg_list)) { - mirror->pg_count = 0; - mirror->pg_base = 0; - } } static void @@ -1227,10 +1223,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) do { list_splice_init(&mirror->pg_list, &head); - mirror->pg_bytes_written -= mirror->pg_count; - mirror->pg_count = 0; - mirror->pg_base = 0; - mirror->pg_recoalesce = 0; while (!list_empty(&head)) { struct nfs_page *req; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 03e0b34c4a64..2c01ee805306 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1317,6 +1317,11 @@ _pnfs_return_layout(struct inode *ino) { struct pnfs_layout_hdr *lo = NULL; struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; LIST_HEAD(tmp_list); const struct cred *cred; nfs4_stateid stateid; @@ -1344,16 +1349,10 @@ _pnfs_return_layout(struct inode *ino) } valid_layout = pnfs_layout_is_valid(lo); pnfs_clear_layoutcommit(ino, &tmp_list); - pnfs_mark_matching_lsegs_return(lo, &tmp_list, NULL, 0); + pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0); - if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { - struct pnfs_layout_range range = { - .iomode = IOMODE_ANY, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range); - } /* Don't send a LAYOUTRETURN if list was initially empty */ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) || @@ -2678,7 +2677,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - u64 rd_size = req->wb_bytes; + u64 rd_size; pnfs_generic_pg_check_layout(pgio); pnfs_generic_pg_check_range(pgio, req); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 19a212f9725d..fe58525cfed4 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1379,7 +1379,7 @@ static const struct kernel_param_ops param_ops_portnr = { .set = param_set_portnr, .get = param_get_uint, }; -#define param_check_portnr(name, p) __param_check(name, p, unsigned int); +#define param_check_portnr(name, p) __param_check(name, p, unsigned int) module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644); diff --git a/fs/proc/base.c b/fs/proc/base.c index 3851bfcdba56..58bbf334265b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2703,6 +2703,10 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, void *page; int rv; + /* A task may only write when it was the opener. */ + if (file->f_cred != current_real_cred()) + return -EPERM; + rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (!task) { diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 4f1373463766..22d904bde6ab 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -288,14 +288,12 @@ static inline void remove_dquot_hash(struct dquot *dquot) static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, struct kqid qid) { - struct hlist_node *node; struct dquot *dquot; - hlist_for_each (node, dquot_hash+hashent) { - dquot = hlist_entry(node, struct dquot, dq_hash); + hlist_for_each_entry(dquot, dquot_hash+hashent, dq_hash) if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid)) return dquot; - } + return NULL; } diff --git a/fs/signalfd.c b/fs/signalfd.c index 040a1142915f..167b5889db4b 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -114,29 +114,24 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, break; case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: + case SIL_PERF_EVENT: /* - * Fall through to the SIL_FAULT case. Both SIL_FAULT_BNDERR - * and SIL_FAULT_PKUERR are only generated by faults that - * deliver them synchronously to userspace. In case someone - * injects one of these signals and signalfd catches it treat - * it as SIL_FAULT. + * Fall through to the SIL_FAULT case. SIL_FAULT_BNDERR, + * SIL_FAULT_PKUERR, and SIL_PERF_EVENT are only + * generated by faults that deliver them synchronously to + * userspace. In case someone injects one of these signals + * and signalfd catches it treat it as SIL_FAULT. */ case SIL_FAULT: new.ssi_addr = (long) kinfo->si_addr; -#ifdef __ARCH_SI_TRAPNO - new.ssi_trapno = kinfo->si_trapno; -#endif break; - case SIL_FAULT_MCEERR: + case SIL_FAULT_TRAPNO: new.ssi_addr = (long) kinfo->si_addr; -#ifdef __ARCH_SI_TRAPNO new.ssi_trapno = kinfo->si_trapno; -#endif - new.ssi_addr_lsb = (short) kinfo->si_addr_lsb; break; - case SIL_PERF_EVENT: + case SIL_FAULT_MCEERR: new.ssi_addr = (long) kinfo->si_addr; - new.ssi_perf = kinfo->si_perf; + new.ssi_addr_lsb = (short) kinfo->si_addr_lsb; break; case SIL_CHLD: new.ssi_pid = kinfo->si_pid; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 7b1128398976..89d492916dea 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -211,11 +211,11 @@ failure: * If the skip factor is limited in this way then the file will use multiple * slots. */ -static inline int calculate_skip(int blocks) +static inline int calculate_skip(u64 blocks) { - int skip = blocks / ((SQUASHFS_META_ENTRIES + 1) + u64 skip = blocks / ((SQUASHFS_META_ENTRIES + 1) * SQUASHFS_META_INDEXES); - return min(SQUASHFS_CACHED_BLKS - 1, skip + 1); + return min((u64) SQUASHFS_CACHED_BLKS - 1, skip + 1); } diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e32a1833d523..bbfea8022a3b 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -325,10 +325,22 @@ out: error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0); if (error2) return error2; - ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + - xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= - pag->pagf_freeblks + pag->pagf_flcount); + + /* + * If there isn't enough space in the AG to satisfy the + * reservation, let the caller know that there wasn't enough + * space. Callers are responsible for deciding what to do + * next, since (in theory) we can stumble along with + * insufficient reservation if data blocks are being freed to + * replenish the AG's free space. + */ + if (!error && + xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + + xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved > + pag->pagf_freeblks + pag->pagf_flcount) + error = -ENOSPC; } + return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 7e3b9b01431e..a3e0e6f672d6 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -605,7 +605,6 @@ xfs_bmap_btree_to_extents( ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); - ASSERT(!xfs_need_iread_extents(ifp)); ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); @@ -5350,7 +5349,6 @@ __xfs_bunmapi( xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ xfs_fileoff_t max_len; - xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; xfs_fileoff_t end; struct xfs_iext_cursor icur; bool done = false; @@ -5442,16 +5440,6 @@ __xfs_bunmapi( del = got; wasdel = isnullstartblock(del.br_startblock); - /* - * Make sure we don't touch multiple AGF headers out of order - * in a single transaction, as that could cause AB-BA deadlocks. - */ - if (!wasdel && !isrt) { - agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); - if (prev_agno != NULLAGNUMBER && prev_agno > agno) - break; - prev_agno = agno; - } if (got.br_startoff < start) { del.br_startoff = start; del.br_blockcount -= start - got.br_startoff; diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index a83bdd0c47a8..bde2b4c64dbe 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -770,6 +770,8 @@ struct xfs_scrub_metadata { /* * ioctl commands that are used by Linux filesystems */ +#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS +#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS #define XFS_IOC_GETVERSION FS_IOC_GETVERSION /* @@ -780,6 +782,8 @@ struct xfs_scrub_metadata { #define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) #define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) +#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR #define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) #define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 5c9a7440d9e4..f3254a4f4cb4 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -559,8 +559,17 @@ xfs_dinode_calc_crc( /* * Validate di_extsize hint. * - * The rules are documented at xfs_ioctl_setattr_check_extsize(). - * These functions must be kept in sync with each other. + * 1. Extent size hint is only valid for directories and regular files. + * 2. FS_XFLAG_EXTSIZE is only valid for regular files. + * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. + * 4. Hint cannot be larger than MAXTEXTLEN. + * 5. Can be changed on directories at any time. + * 6. Hint value of 0 turns off hints, clears inode flags. + * 7. Extent size must be a multiple of the appropriate block size. + * For realtime files, this is the rt extent size. + * 8. For non-realtime files, the extent size hint must be limited + * to half the AG size to avoid alignment extending the extent beyond the + * limits of the AG. */ xfs_failaddr_t xfs_inode_validate_extsize( @@ -580,6 +589,28 @@ xfs_inode_validate_extsize( inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); extsize_bytes = XFS_FSB_TO_B(mp, extsize); + /* + * This comment describes a historic gap in this verifier function. + * + * On older kernels, the extent size hint verifier doesn't check that + * the extent size hint is an integer multiple of the realtime extent + * size on a directory with both RTINHERIT and EXTSZINHERIT flags set. + * The verifier has always enforced the alignment rule for regular + * files with the REALTIME flag set. + * + * If a directory with a misaligned extent size hint is allowed to + * propagate that hint into a new regular realtime file, the result + * is that the inode cluster buffer verifier will trigger a corruption + * shutdown the next time it is run. + * + * Unfortunately, there could be filesystems with these misconfigured + * directories in the wild, so we cannot add a check to this verifier + * at this time because that will result a new source of directory + * corruption errors when reading an existing filesystem. Instead, we + * permit the misconfiguration to pass through the verifiers so that + * callers of this function can correct and mitigate externally. + */ + if (rt_flag) blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; else @@ -616,8 +647,15 @@ xfs_inode_validate_extsize( /* * Validate di_cowextsize hint. * - * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). - * These functions must be kept in sync with each other. + * 1. CoW extent size hint can only be set if reflink is enabled on the fs. + * The inode does not have to have any shared blocks, but it must be a v3. + * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files; + * for a directory, the hint is propagated to new files. + * 3. Can be changed on files & directories at any time. + * 4. Hint value of 0 turns off hints, clears inode flags. + * 5. Extent size must be a multiple of the appropriate block size. + * 6. The extent size hint must be limited to half the AG size to avoid + * alignment extending the extent beyond the limits of the AG. */ xfs_failaddr_t xfs_inode_validate_cowextsize( diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 78324e043e25..8d595a5c4abd 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -143,6 +143,23 @@ xfs_trans_log_inode( } /* + * Inode verifiers on older kernels don't check that the extent size + * hint is an integer multiple of the rt extent size on a directory + * with both rtinherit and extszinherit flags set. If we're logging a + * directory that is misconfigured in this way, clear the hint. + */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { + xfs_info_once(ip->i_mount, + "Correcting misaligned extent size hint in inode 0x%llx.", ip->i_ino); + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + + /* * Record the specific change for fdatasync optimisation. This allows * fdatasync to skip log forces for inodes that are only timestamp * dirty. diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index aa874607618a..be38c960da85 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -74,7 +74,9 @@ __xchk_process_error( return true; case -EDEADLOCK: /* Used to restart an op with deadlock avoidance. */ - trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); + trace_xchk_deadlock_retry( + sc->ip ? sc->ip : XFS_I(file_inode(sc->file)), + sc->sm, *error); break; case -EFSBADCRC: case -EFSCORRUPTED: diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a5e9d7d34023..0936f3a96fe6 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -71,18 +71,24 @@ xfs_zero_extent( #ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc( - struct xfs_bmalloca *ap) /* bmap alloc argument struct */ + struct xfs_bmalloca *ap) { - int error; /* error return value */ - xfs_mount_t *mp; /* mount point structure */ - xfs_extlen_t prod = 0; /* product factor for allocators */ - xfs_extlen_t mod = 0; /* product factor for allocators */ - xfs_extlen_t ralen = 0; /* realtime allocation length */ - xfs_extlen_t align; /* minimum allocation alignment */ - xfs_rtblock_t rtb; - - mp = ap->ip->i_mount; + struct xfs_mount *mp = ap->ip->i_mount; + xfs_fileoff_t orig_offset = ap->offset; + xfs_rtblock_t rtb; + xfs_extlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t mod = 0; /* product factor for allocators */ + xfs_extlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_extlen_t orig_length = ap->length; + xfs_extlen_t minlen = mp->m_sb.sb_rextsize; + xfs_extlen_t raminlen; + bool rtlocked = false; + bool ignore_locality = false; + int error; + align = xfs_get_extsz_hint(ap->ip); +retry: prod = align / mp->m_sb.sb_rextsize; error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, @@ -93,6 +99,15 @@ xfs_bmap_rtalloc( ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); /* + * If we shifted the file offset downward to satisfy an extent size + * hint, increase minlen by that amount so that the allocator won't + * give us an allocation that's too short to cover at least one of the + * blocks that the caller asked for. + */ + if (ap->offset != orig_offset) + minlen += orig_offset - ap->offset; + + /* * If the offset & length are not perfectly aligned * then kill prod, it will just get us in trouble. */ @@ -116,10 +131,13 @@ xfs_bmap_rtalloc( /* * Lock out modifications to both the RT bitmap and summary inodes */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); - xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); + if (!rtlocked) { + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); + xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); + rtlocked = true; + } /* * If it's an allocation to an empty file at offset 0, @@ -141,33 +159,59 @@ xfs_bmap_rtalloc( /* * Realtime allocation, done through xfs_rtallocate_extent. */ - do_div(ap->blkno, mp->m_sb.sb_rextsize); + if (ignore_locality) + ap->blkno = 0; + else + do_div(ap->blkno, mp->m_sb.sb_rextsize); rtb = ap->blkno; ap->length = ralen; - error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, - &ralen, ap->wasdel, prod, &rtb); + raminlen = max_t(xfs_extlen_t, 1, minlen / mp->m_sb.sb_rextsize); + error = xfs_rtallocate_extent(ap->tp, ap->blkno, raminlen, ap->length, + &ralen, ap->wasdel, prod, &rtb); if (error) return error; - ap->blkno = rtb; - if (ap->blkno != NULLFSBLOCK) { - ap->blkno *= mp->m_sb.sb_rextsize; - ralen *= mp->m_sb.sb_rextsize; - ap->length = ralen; - ap->ip->i_nblocks += ralen; + if (rtb != NULLRTBLOCK) { + ap->blkno = rtb * mp->m_sb.sb_rextsize; + ap->length = ralen * mp->m_sb.sb_rextsize; + ap->ip->i_nblocks += ap->length; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) - ap->ip->i_delayed_blks -= ralen; + ap->ip->i_delayed_blks -= ap->length; /* * Adjust the disk quota also. This was reserved * earlier. */ xfs_trans_mod_dquot_byino(ap->tp, ap->ip, ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : - XFS_TRANS_DQ_RTBCOUNT, (long) ralen); - } else { - ap->length = 0; + XFS_TRANS_DQ_RTBCOUNT, ap->length); + return 0; } + + if (align > mp->m_sb.sb_rextsize) { + /* + * We previously enlarged the request length to try to satisfy + * an extent size hint. The allocator didn't return anything, + * so reset the parameters to the original values and try again + * without alignment criteria. + */ + ap->offset = orig_offset; + ap->length = orig_length; + minlen = align = mp->m_sb.sb_rextsize; + goto retry; + } + + if (!ignore_locality && ap->blkno != 0) { + /* + * If we can't allocate near a specific rt extent, try again + * without locality criteria. + */ + ignore_locality = true; + goto retry; + } + + ap->blkno = NULLFSBLOCK; + ap->length = 0; return 0; } #endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0369eb22c1bb..e4c2da4566f1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -690,6 +690,7 @@ xfs_inode_inherit_flags( const struct xfs_inode *pip) { unsigned int di_flags = 0; + xfs_failaddr_t failaddr; umode_t mode = VFS_I(ip)->i_mode; if (S_ISDIR(mode)) { @@ -729,6 +730,24 @@ xfs_inode_inherit_flags( di_flags |= XFS_DIFLAG_FILESTREAM; ip->i_diflags |= di_flags; + + /* + * Inode verifiers on older kernels only check that the extent size + * hint is an integer multiple of the rt extent size on realtime files. + * They did not check the hint alignment on a directory with both + * rtinherit and extszinherit flags set. If the misaligned hint is + * propagated from a directory into a new realtime file, new file + * allocations will fail due to math errors in the rt allocator and/or + * trip the verifiers. Validate the hint settings in the new file so + * that we don't let broken hints propagate. + */ + failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize, + VFS_I(ip)->i_mode, ip->i_diflags); + if (failaddr) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + } } /* Propagate di_flags2 from a parent inode to a child inode. */ @@ -737,12 +756,22 @@ xfs_inode_inherit_flags2( struct xfs_inode *ip, const struct xfs_inode *pip) { + xfs_failaddr_t failaddr; + if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_cowextsize = pip->i_cowextsize; } if (pip->i_diflags2 & XFS_DIFLAG2_DAX) ip->i_diflags2 |= XFS_DIFLAG2_DAX; + + /* Don't let invalid cowextsize hints propagate. */ + failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize, + VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2); + if (failaddr) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + } } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 3925bfcb2365..1fe4c1fc0aea 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1267,20 +1267,8 @@ out_error: } /* - * extent size hint validation is somewhat cumbersome. Rules are: - * - * 1. extent size hint is only valid for directories and regular files - * 2. FS_XFLAG_EXTSIZE is only valid for regular files - * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. - * 4. can only be changed on regular files if no extents are allocated - * 5. can be changed on directories at any time - * 6. extsize hint of 0 turns off hints, clears inode flags. - * 7. Extent size must be a multiple of the appropriate block size. - * 8. for non-realtime files, the extent size hint must be limited - * to half the AG size to avoid alignment extending the extent beyond the - * limits of the AG. - * - * Please keep this function in sync with xfs_scrub_inode_extsize. + * Validate a proposed extent size hint. For regular files, the hint can only + * be changed if no extents are allocated. */ static int xfs_ioctl_setattr_check_extsize( @@ -1288,86 +1276,65 @@ xfs_ioctl_setattr_check_extsize( struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; - xfs_extlen_t size; - xfs_fsblock_t extsize_fsb; + xfs_failaddr_t failaddr; + uint16_t new_diflags; if (!fa->fsx_valid) return 0; if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && - ((ip->i_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) + XFS_FSB_TO_B(mp, ip->i_extsize) != fa->fsx_extsize) return -EINVAL; - if (fa->fsx_extsize == 0) - return 0; - - extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); - if (extsize_fsb > MAXEXTLEN) + if (fa->fsx_extsize & mp->m_blockmask) return -EINVAL; - if (XFS_IS_REALTIME_INODE(ip) || - (fa->fsx_xflags & FS_XFLAG_REALTIME)) { - size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; - } else { - size = mp->m_sb.sb_blocksize; - if (extsize_fsb > mp->m_sb.sb_agblocks / 2) + new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); + + /* + * Inode verifiers on older kernels don't check that the extent size + * hint is an integer multiple of the rt extent size on a directory + * with both rtinherit and extszinherit flags set. Don't let sysadmins + * misconfigure directories. + */ + if ((new_diflags & XFS_DIFLAG_RTINHERIT) && + (new_diflags & XFS_DIFLAG_EXTSZINHERIT)) { + unsigned int rtextsize_bytes; + + rtextsize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); + if (fa->fsx_extsize % rtextsize_bytes) return -EINVAL; } - if (fa->fsx_extsize % size) - return -EINVAL; - - return 0; + failaddr = xfs_inode_validate_extsize(ip->i_mount, + XFS_B_TO_FSB(mp, fa->fsx_extsize), + VFS_I(ip)->i_mode, new_diflags); + return failaddr != NULL ? -EINVAL : 0; } -/* - * CoW extent size hint validation rules are: - * - * 1. CoW extent size hint can only be set if reflink is enabled on the fs. - * The inode does not have to have any shared blocks, but it must be a v3. - * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files; - * for a directory, the hint is propagated to new files. - * 3. Can be changed on files & directories at any time. - * 4. CoW extsize hint of 0 turns off hints, clears inode flags. - * 5. Extent size must be a multiple of the appropriate block size. - * 6. The extent size hint must be limited to half the AG size to avoid - * alignment extending the extent beyond the limits of the AG. - * - * Please keep this function in sync with xfs_scrub_inode_cowextsize. - */ static int xfs_ioctl_setattr_check_cowextsize( struct xfs_inode *ip, struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; - xfs_extlen_t size; - xfs_fsblock_t cowextsize_fsb; + xfs_failaddr_t failaddr; + uint64_t new_diflags2; + uint16_t new_diflags; if (!fa->fsx_valid) return 0; - if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)) - return 0; - - if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb)) + if (fa->fsx_cowextsize & mp->m_blockmask) return -EINVAL; - if (fa->fsx_cowextsize == 0) - return 0; + new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); + new_diflags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); - cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize); - if (cowextsize_fsb > MAXEXTLEN) - return -EINVAL; - - size = mp->m_sb.sb_blocksize; - if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2) - return -EINVAL; - - if (fa->fsx_cowextsize % size) - return -EINVAL; - - return 0; + failaddr = xfs_inode_validate_cowextsize(ip->i_mount, + XFS_B_TO_FSB(mp, fa->fsx_cowextsize), + VFS_I(ip)->i_mode, new_diflags, new_diflags2); + return failaddr != NULL ? -EINVAL : 0; } static int diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 3c392b1512ac..7ec1a9207517 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -73,6 +73,8 @@ do { \ xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__) #define xfs_notice_once(dev, fmt, ...) \ xfs_printk_once(xfs_notice, dev, fmt, ##__VA_ARGS__) +#define xfs_info_once(dev, fmt, ...) \ + xfs_printk_once(xfs_info, dev, fmt, ##__VA_ARGS__) void assfail(struct xfs_mount *mp, char *expr, char *f, int l); void asswarn(struct xfs_mount *mp, char *expr, char *f, int l); |