diff options
Diffstat (limited to 'fs')
231 files changed, 8098 insertions, 3153 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 6caca025019d..072e7599583a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -575,7 +575,7 @@ static int v9fs_init_inode_cache(void) v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", sizeof(struct v9fs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), v9fs_inode_init_once); if (!v9fs_inode_cache) return -ENOMEM; diff --git a/fs/Kconfig b/fs/Kconfig index 2bb1ef86c411..9adee0d7536e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -50,7 +50,8 @@ config FS_DAX_PMD bool default FS_DAX depends on FS_DAX - depends on BROKEN + depends on ZONE_DEVICE + depends on TRANSPARENT_HUGEPAGE endif # BLOCK diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 4d4a0df8344f..c9fdfb112933 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -271,7 +271,7 @@ static int __init init_inodecache(void) adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", sizeof(struct adfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (adfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/affs/super.c b/fs/affs/super.c index 8836df5f1e11..2a6713b6b9f4 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -132,7 +132,7 @@ static int __init init_inodecache(void) affs_inode_cachep = kmem_cache_create("affs_inode_cache", sizeof(struct affs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (affs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/afs/super.c b/fs/afs/super.c index 1fb4a5129f7d..81afefe7d8a6 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -91,7 +91,7 @@ int __init afs_fs_init(void) afs_inode_cachep = kmem_cache_create("afs_inode_cache", sizeof(struct afs_vnode), 0, - SLAB_HWCACHE_ALIGN, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, afs_i_init_once); if (!afs_inode_cachep) { printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n"); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 25250fa87086..cc0e08252913 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -434,7 +434,7 @@ befs_init_inodecache(void) befs_inode_cachep = kmem_cache_create("befs_inode_cache", sizeof (struct befs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (befs_inode_cachep == NULL) { pr_err("%s: Couldn't initialize inode slabcache\n", __func__); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index fdcb4d69f430..1e5c896f6b79 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -270,7 +270,7 @@ static int __init init_inodecache(void) bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (bfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/block_dev.c b/fs/block_dev.c index 01b8e0d4b4ff..530145b607c4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -156,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock, return 0; } +static struct inode *bdev_file_inode(struct file *file) +{ + return file->f_mapping->host; +} + static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct inode *inode = bdev_file_inode(file); if (IS_DAX(inode)) return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, @@ -338,7 +343,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, */ static loff_t block_llseek(struct file *file, loff_t offset, int whence) { - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t retval; mutex_lock(&bd_inode->i_mutex); @@ -349,7 +354,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { - struct inode *bd_inode = filp->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); int error; @@ -432,7 +437,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + result = blk_queue_enter(bdev->bd_queue, GFP_NOIO); if (result) return result; @@ -450,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page); /** * bdev_direct_access() - Get the address for directly-accessibly memory * @bdev: The device containing the memory - * @sector: The offset within the device - * @addr: Where to put the address of the memory - * @pfn: The Page Frame Number for the memory - * @size: The number of bytes requested + * @dax: control and output parameters for ->direct_access * * If a block device is made up of directly addressable memory, this function * will tell the caller the PFN and the address of the memory. The address @@ -464,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page); * Return: negative errno if an error occurs, otherwise the number of bytes * accessible at this address. */ -long bdev_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **addr, unsigned long *pfn, long size) +long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) { - long avail; + sector_t sector = dax->sector; + long avail, size = dax->size; const struct block_device_operations *ops = bdev->bd_disk->fops; /* @@ -486,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector, sector += get_start_sect(bdev); if (sector % (PAGE_SIZE / 512)) return -EINVAL; - avail = ops->direct_access(bdev, sector, addr, pfn); + avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn); if (!avail) return -ERANGE; + if (avail > 0 && avail & ~PAGE_MASK) + return -ENXIO; return min(avail, size); } EXPORT_SYMBOL_GPL(bdev_direct_access); @@ -590,7 +594,7 @@ void __init bdev_cache_init(void) bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) @@ -1224,8 +1228,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + if (!blkdev_dax_capable(bdev)) + bdev->bd_inode->i_flags &= ~S_DAX; + } /* * If the device is invalidated, rescan partition @@ -1239,6 +1246,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) else if (ret == -ENOMEDIUM) invalidate_partitions(disk, bdev); } + if (ret) goto out_clear; } else { @@ -1259,12 +1267,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - /* - * If the partition is not aligned on a page - * boundary, we can't do dax I/O to it. - */ - if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) || - (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + if (!blkdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } } else { @@ -1599,14 +1602,14 @@ EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { - struct block_device *bdev = I_BDEV(filp->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); blkdev_put(bdev, filp->f_mode); return 0; } static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { - struct block_device *bdev = I_BDEV(file->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); fmode_t mode = file->f_mode; /* @@ -1631,7 +1634,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); struct blk_plug plug; ssize_t ret; @@ -1663,7 +1666,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter); ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); loff_t pos = iocb->ki_pos; @@ -1702,13 +1705,101 @@ static const struct address_space_operations def_blk_aops = { .is_dirty_writeback = buffer_check_dirty_writeback, }; +#ifdef CONFIG_FS_DAX +/* + * In the raw block case we do not need to contend with truncation nor + * unwritten file extents. Without those concerns there is no need for + * additional locking beyond the mmap_sem context that these routines + * are already executing under. + * + * Note, there is no protection if the block device is dynamically + * resized (partition grow/shrink) during a fault. A stable block device + * size is already not enforced in the blkdev_direct_IO path. + * + * For DAX, it is the responsibility of the block device driver to + * ensure the whole-disk device size is stable while requests are in + * flight. + * + * Finally, unlike the filemap_page_mkwrite() case there is no + * filesystem superblock to sync against freezing. We still include a + * pfn_mkwrite callback for dax drivers to receive write fault + * notifications. + */ +static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return __dax_fault(vma, vmf, blkdev_get_block, NULL); +} + +static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); +} + +static void blkdev_vm_open(struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(vma->vm_file); + struct block_device *bdev = I_BDEV(bd_inode); + + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count++; + mutex_unlock(&bd_inode->i_mutex); +} + +static void blkdev_vm_close(struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(vma->vm_file); + struct block_device *bdev = I_BDEV(bd_inode); + + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count--; + mutex_unlock(&bd_inode->i_mutex); +} + +static const struct vm_operations_struct blkdev_dax_vm_ops = { + .open = blkdev_vm_open, + .close = blkdev_vm_close, + .fault = blkdev_dax_fault, + .pmd_fault = blkdev_dax_pmd_fault, + .pfn_mkwrite = blkdev_dax_fault, +}; + +static const struct vm_operations_struct blkdev_default_vm_ops = { + .open = blkdev_vm_open, + .close = blkdev_vm_close, + .fault = filemap_fault, + .map_pages = filemap_map_pages, +}; + +static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(bd_inode); + + file_accessed(file); + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count++; + if (IS_DAX(bd_inode)) { + vma->vm_ops = &blkdev_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + } else { + vma->vm_ops = &blkdev_default_vm_ops; + } + mutex_unlock(&bd_inode->i_mutex); + + return 0; +} +#else +#define blkdev_mmap generic_file_mmap +#endif + const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, - .mmap = generic_file_mmap, + .mmap = blkdev_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 6d1d0b93b1aa..128ce17a80b0 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -9,11 +9,12 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o hash.o + uuid-tree.o props.o hash.o free-space-tree.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ - tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o + tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ + tests/free-space-tree-tests.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index f89db0c21b51..6d263bb1621c 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -48,7 +48,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, "", 0); if (size > 0) { - value = kzalloc(size, GFP_NOFS); + value = kzalloc(size, GFP_KERNEL); if (!value) return ERR_PTR(-ENOMEM); size = __btrfs_getxattr(inode, name, value, size); @@ -102,7 +102,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, if (acl) { size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_NOFS); + value = kmalloc(size, GFP_KERNEL); if (!value) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 3e36e4adc4a3..88d9af3d4581 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -97,7 +97,7 @@ static struct __btrfs_workqueue * __btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, int thresh) { - struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); + struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; @@ -148,7 +148,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, int limit_active, int thresh) { - struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index d453d62ab0c6..08405a3da6b1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -520,13 +520,10 @@ static inline int ref_for_same_block(struct __prelim_ref *ref1, static int __add_missing_keys(struct btrfs_fs_info *fs_info, struct list_head *head) { - struct list_head *pos; + struct __prelim_ref *ref; struct extent_buffer *eb; - list_for_each(pos, head) { - struct __prelim_ref *ref; - ref = list_entry(pos, struct __prelim_ref, list); - + list_for_each_entry(ref, head, list) { if (ref->parent) continue; if (ref->key_for_search.type) @@ -563,23 +560,15 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, */ static void __merge_refs(struct list_head *head, int mode) { - struct list_head *pos1; + struct __prelim_ref *ref1; - list_for_each(pos1, head) { - struct list_head *n2; - struct list_head *pos2; - struct __prelim_ref *ref1; + list_for_each_entry(ref1, head, list) { + struct __prelim_ref *ref2 = ref1, *tmp; - ref1 = list_entry(pos1, struct __prelim_ref, list); - - for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; - pos2 = n2, n2 = pos2->next) { - struct __prelim_ref *ref2; + list_for_each_entry_safe_continue(ref2, tmp, head, list) { struct __prelim_ref *xchg; struct extent_inode_elem *eie; - ref2 = list_entry(pos2, struct __prelim_ref, list); - if (!ref_for_same_block(ref1, ref2)) continue; if (mode == 1) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0ef5cc13fae2..61205e3bbefa 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -192,6 +192,10 @@ struct btrfs_inode { /* File creation time. */ struct timespec i_otime; + /* Hook into fs_info->delayed_iputs */ + struct list_head delayed_iput; + long delayed_iput_count; + struct inode vfs_inode; }; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 0340c57bf377..861d472564c1 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -531,13 +531,9 @@ static struct btrfsic_block *btrfsic_block_hashtable_lookup( (((unsigned int)(dev_bytenr >> 16)) ^ ((unsigned int)((uintptr_t)bdev))) & (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_block *const b = - list_entry(elem, struct btrfsic_block, - collision_resolving_node); + struct btrfsic_block *b; + list_for_each_entry(b, h->table + hashval, collision_resolving_node) { if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) return b; } @@ -588,13 +584,9 @@ static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( ((unsigned int)((uintptr_t)bdev_ref_to)) ^ ((unsigned int)((uintptr_t)bdev_ref_from))) & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_block_link *const l = - list_entry(elem, struct btrfsic_block_link, - collision_resolving_node); + struct btrfsic_block_link *l; + list_for_each_entry(l, h->table + hashval, collision_resolving_node) { BUG_ON(NULL == l->block_ref_to); BUG_ON(NULL == l->block_ref_from); if (l->block_ref_to->dev_state->bdev == bdev_ref_to && @@ -639,13 +631,9 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( const unsigned int hashval = (((unsigned int)((uintptr_t)bdev)) & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_dev_state *const ds = - list_entry(elem, struct btrfsic_dev_state, - collision_resolving_node); + struct btrfsic_dev_state *ds; + list_for_each_entry(ds, h->table + hashval, collision_resolving_node) { if (ds->bdev == bdev) return ds; } @@ -1720,29 +1708,20 @@ static int btrfsic_read_block(struct btrfsic_state *state, static void btrfsic_dump_database(struct btrfsic_state *state) { - struct list_head *elem_all; + const struct btrfsic_block *b_all; BUG_ON(NULL == state); printk(KERN_INFO "all_blocks_list:\n"); - list_for_each(elem_all, &state->all_blocks_list) { - const struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *elem_ref_from; + list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) { + const struct btrfsic_block_link *l; printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", btrfsic_get_block_type(state, b_all), b_all->logical_bytenr, b_all->dev_state->name, b_all->dev_bytenr, b_all->mirror_num); - list_for_each(elem_ref_to, &b_all->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) { printk(KERN_INFO " %c @%llu (%s/%llu/%d)" " refers %u* to" " %c @%llu (%s/%llu/%d)\n", @@ -1757,12 +1736,7 @@ static void btrfsic_dump_database(struct btrfsic_state *state) l->block_ref_to->mirror_num); } - list_for_each(elem_ref_from, &b_all->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, - struct btrfsic_block_link, - node_ref_from); - + list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) { printk(KERN_INFO " %c @%llu (%s/%llu/%d)" " is ref %u* from" " %c @%llu (%s/%llu/%d)\n", @@ -1845,8 +1819,7 @@ again: &state->block_hashtable); if (NULL != block) { u64 bytenr = 0; - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; + struct btrfsic_block_link *l, *tmp; if (block->is_superblock) { bytenr = btrfs_super_bytenr((struct btrfs_super_block *) @@ -1967,13 +1940,8 @@ again: * because it still carries valueable information * like whether it was ever written and IO completed. */ - list_for_each_safe(elem_ref_to, tmp_ref_to, - &block->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry_safe(l, tmp, &block->ref_to_list, + node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) btrfsic_print_rem_link(state, l); l->ref_cnt--; @@ -2436,7 +2404,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, struct btrfsic_block *const block, int recursion_level) { - struct list_head *elem_ref_to; + const struct btrfsic_block_link *l; int ret = 0; if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { @@ -2464,11 +2432,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, * This algorithm is recursive because the amount of used stack * space is very small and the max recursion depth is limited. */ - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry(l, &block->ref_to_list, node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "rl=%d, %c @%llu (%s/%llu/%d)" @@ -2561,7 +2525,7 @@ static int btrfsic_is_block_ref_by_superblock( const struct btrfsic_block *block, int recursion_level) { - struct list_head *elem_ref_from; + const struct btrfsic_block_link *l; if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { /* refer to comment at "abort cyclic linkage (case 1)" */ @@ -2576,11 +2540,7 @@ static int btrfsic_is_block_ref_by_superblock( * This algorithm is recursive because the amount of used stack space * is very small and the max recursion depth is limited. */ - list_for_each(elem_ref_from, &block->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, struct btrfsic_block_link, - node_ref_from); - + list_for_each_entry(l, &block->ref_from_list, node_ref_from) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "rl=%d, %c @%llu (%s/%llu/%d)" @@ -2669,7 +2629,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, const struct btrfsic_block *block, int indent_level) { - struct list_head *elem_ref_to; + const struct btrfsic_block_link *l; int indent_add; static char buf[80]; int cursor_position; @@ -2704,11 +2664,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, } cursor_position = indent_level; - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry(l, &block->ref_to_list, node_ref_to) { while (cursor_position < indent_level) { printk(" "); cursor_position++; @@ -3165,8 +3121,7 @@ int btrfsic_mount(struct btrfs_root *root, void btrfsic_unmount(struct btrfs_root *root, struct btrfs_fs_devices *fs_devices) { - struct list_head *elem_all; - struct list_head *tmp_all; + struct btrfsic_block *b_all, *tmp_all; struct btrfsic_state *state; struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; @@ -3206,20 +3161,12 @@ void btrfsic_unmount(struct btrfs_root *root, * just free all memory that was allocated dynamically. * Free the blocks and the block_links. */ - list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { - struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; - - list_for_each_safe(elem_ref_to, tmp_ref_to, - &b_all->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); + list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list, + all_blocks_node) { + struct btrfsic_block_link *l, *tmp; + list_for_each_entry_safe(l, tmp, &b_all->ref_to_list, + node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) btrfsic_print_rem_link(state, l); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5b8e235c4b6d..769e0ff1b4ce 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1555,7 +1555,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } - search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + search_start = buf->start & ~((u64)SZ_1G - 1); if (parent) btrfs_set_lock_blocking(parent); @@ -2248,7 +2248,6 @@ static void reada_for_search(struct btrfs_root *root, u64 target; u64 nread = 0; u64 gen; - int direction = path->reada; struct extent_buffer *eb; u32 nr; u32 blocksize; @@ -2276,16 +2275,16 @@ static void reada_for_search(struct btrfs_root *root, nr = slot; while (1) { - if (direction < 0) { + if (path->reada == READA_BACK) { if (nr == 0) break; nr--; - } else if (direction > 0) { + } else if (path->reada == READA_FORWARD) { nr++; if (nr >= nritems) break; } - if (path->reada < 0 && objectid) { + if (path->reada == READA_BACK && objectid) { btrfs_node_key(node, &disk_key, nr); if (btrfs_disk_key_objectid(&disk_key) != objectid) break; @@ -2493,7 +2492,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(p); free_extent_buffer(tmp); - if (p->reada) + if (p->reada != READA_NONE) reada_for_search(root, p, level, slot, key->objectid); btrfs_release_path(p); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b7e4e344e8e0..97ad9bbeb35d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -35,6 +35,7 @@ #include <linux/btrfs.h> #include <linux/workqueue.h> #include <linux/security.h> +#include <linux/sizes.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -96,6 +97,9 @@ struct btrfs_ordered_sum; /* for storing items that use the BTRFS_UUID_KEY* types */ #define BTRFS_UUID_TREE_OBJECTID 9ULL +/* tracks free space in block groups. */ +#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL + /* for storing balance parameters in the root tree */ #define BTRFS_BALANCE_OBJECTID -4ULL @@ -174,7 +178,7 @@ struct btrfs_ordered_sum; /* csum types */ #define BTRFS_CSUM_TYPE_CRC32 0 -static int btrfs_csum_sizes[] = { 4 }; +static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 @@ -196,9 +200,9 @@ static int btrfs_csum_sizes[] = { 4 }; /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) -#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) +#define BTRFS_DIRTY_METADATA_THRESH SZ_32M -#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024) +#define BTRFS_MAX_EXTENT_SIZE SZ_128M /* * The key defines the order in the tree, and so it also defines (optimal) @@ -500,6 +504,8 @@ struct btrfs_super_block { * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ +#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) + #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) @@ -526,7 +532,10 @@ struct btrfs_super_block { #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL -#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL + +#define BTRFS_FEATURE_COMPAT_RO_SUPP \ + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE) + #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL @@ -590,14 +599,15 @@ struct btrfs_node { * The slots array records the index of the item or block pointer * used while walking the tree. */ +enum { READA_NONE = 0, READA_BACK, READA_FORWARD }; struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; /* if there is real range locking, this locks field will change */ - int locks[BTRFS_MAX_LEVEL]; - int reada; + u8 locks[BTRFS_MAX_LEVEL]; + u8 reada; /* keep some upper locks as we walk down */ - int lowest_level; + u8 lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks @@ -1088,6 +1098,13 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +struct btrfs_free_space_info { + __le32 extent_count; + __le32 flags; +} __attribute__ ((__packed__)); + +#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) + #define BTRFS_QGROUP_LEVEL_SHIFT 48 static inline u64 btrfs_qgroup_level(u64 qgroupid) { @@ -1296,6 +1313,9 @@ struct btrfs_caching_control { atomic_t count; }; +/* Once caching_thread() finds this much free space, it will wake up waiters. */ +#define CACHING_CTL_WAKE_UP (1024 * 1024 * 2) + struct btrfs_io_ctl { void *cur, *orig; struct page *page; @@ -1321,8 +1341,20 @@ struct btrfs_block_group_cache { u64 delalloc_bytes; u64 bytes_super; u64 flags; - u64 sectorsize; u64 cache_generation; + u32 sectorsize; + + /* + * If the free space extent count exceeds this number, convert the block + * group to bitmaps. + */ + u32 bitmap_high_thresh; + + /* + * If the free space extent count drops below this number, convert the + * block group back to extents. + */ + u32 bitmap_low_thresh; /* * It is just used for the delayed data space allocation because @@ -1378,6 +1410,15 @@ struct btrfs_block_group_cache { struct list_head io_list; struct btrfs_io_ctl io_ctl; + + /* Lock for free space tree operations. */ + struct mutex free_space_lock; + + /* + * Does the block group need to be added to the free space tree? + * Protected by free_space_lock. + */ + int needs_free_space; }; /* delayed seq elem */ @@ -1429,6 +1470,7 @@ struct btrfs_fs_info { struct btrfs_root *csum_root; struct btrfs_root *quota_root; struct btrfs_root *uuid_root; + struct btrfs_root *free_space_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -1816,6 +1858,8 @@ struct btrfs_fs_info { * and will be latter freed. Protected by fs_info->chunk_mutex. */ struct list_head pinned_chunks; + + int creating_free_space_tree; }; struct btrfs_subvolume_writers { @@ -2092,6 +2136,27 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_BLOCK_GROUP_ITEM_KEY 192 +/* + * Every block group is represented in the free space tree by a free space info + * item, which stores some accounting information. It is keyed on + * (block_group_start, FREE_SPACE_INFO, block_group_length). + */ +#define BTRFS_FREE_SPACE_INFO_KEY 198 + +/* + * A free space extent tracks an extent of space that is free in a block group. + * It is keyed on (start, FREE_SPACE_EXTENT, length). + */ +#define BTRFS_FREE_SPACE_EXTENT_KEY 199 + +/* + * When a block group becomes very fragmented, we convert it to use bitmaps + * instead of extents. A free space bitmap is keyed on + * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with + * (length / sectorsize) bits. + */ +#define BTRFS_FREE_SPACE_BITMAP_KEY 200 + #define BTRFS_DEV_EXTENT_KEY 204 #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 @@ -2184,6 +2249,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) #define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) +#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (8192) @@ -2506,6 +2572,11 @@ BTRFS_SETGET_FUNCS(disk_block_group_flags, BTRFS_SETGET_STACK_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); +/* struct btrfs_free_space_info */ +BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, + extent_count, 32); +BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); + /* struct btrfs_inode_ref */ BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); @@ -3573,6 +3644,9 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root); void check_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *root, const u64 type); +u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end); + /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -3737,6 +3811,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->csum_root); kfree(fs_info->quota_root); kfree(fs_info->uuid_root); + kfree(fs_info->free_space_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); security_free_mnt_opts(&fs_info->security_opts); @@ -3906,7 +3981,6 @@ void btrfs_extent_item_to_extent_map(struct inode *inode, /* inode.c */ struct btrfs_delalloc_work { struct inode *inode; - int wait; int delay_iput; struct completion completion; struct list_head list; @@ -3914,7 +3988,7 @@ struct btrfs_delalloc_work { }; struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, - int wait, int delay_iput); + int delay_iput); void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, @@ -4253,16 +4327,98 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, } } +#define btrfs_clear_fs_incompat(__fs_info, opt) \ + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, "clearing %llu feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + #define btrfs_fs_incompat(fs_info, opt) \ __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) -static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) +static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) { struct btrfs_super_block *disk_super; disk_super = fs_info->super_copy; return !!(btrfs_super_incompat_flags(disk_super) & flag); } +#define btrfs_set_fs_compat_ro(__fs_info, opt) \ + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, "setting %llu ro feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, "clearing %llu ro feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +#define btrfs_fs_compat_ro(fs_info, opt) \ + __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_compat_ro_flags(disk_super) & flag); +} + /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index e0941fbb913c..0be47e4b8136 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -54,16 +54,11 @@ static inline void btrfs_init_delayed_node( delayed_node->root = root; delayed_node->inode_id = inode_id; atomic_set(&delayed_node->refs, 0); - delayed_node->count = 0; - delayed_node->flags = 0; delayed_node->ins_root = RB_ROOT; delayed_node->del_root = RB_ROOT; mutex_init(&delayed_node->mutex); - delayed_node->index_cnt = 0; INIT_LIST_HEAD(&delayed_node->n_list); INIT_LIST_HEAD(&delayed_node->p_list); - delayed_node->bytes_reserved = 0; - memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item)); } static inline int btrfs_is_continuous_delayed_item( @@ -132,7 +127,7 @@ again: if (node) return node; - node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS); + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); if (!node) return ERR_PTR(-ENOMEM); btrfs_init_delayed_node(node, root, ino); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e06dd75ad13f..914ac13bd92f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -493,12 +493,12 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, memcpy(&existing_ref->extent_op->key, &ref->extent_op->key, sizeof(ref->extent_op->key)); - existing_ref->extent_op->update_key = 1; + existing_ref->extent_op->update_key = true; } if (ref->extent_op->update_flags) { existing_ref->extent_op->flags_to_set |= ref->extent_op->flags_to_set; - existing_ref->extent_op->update_flags = 1; + existing_ref->extent_op->update_flags = true; } btrfs_free_delayed_extent_op(ref->extent_op); } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 00ed02cbf3e9..c24b653c7343 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -75,11 +75,11 @@ struct btrfs_delayed_ref_node { struct btrfs_delayed_extent_op { struct btrfs_disk_key key; + u8 level; + bool update_key; + bool update_flags; + bool is_data; u64 flags_to_set; - int level; - unsigned int update_key:1; - unsigned int update_flags:1; - unsigned int is_data:1; }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 42a378a4eefb..e99ccd6ffb2c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -42,6 +42,7 @@ #include "locking.h" #include "tree-log.h" #include "free-space-cache.h" +#include "free-space-tree.h" #include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" @@ -362,7 +363,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, } lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, - 0, &cached_state); + &cached_state); if (extent_buffer_uptodate(eb) && btrfs_header_generation(eb) == parent_transid) { ret = 0; @@ -1650,6 +1651,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, if (location->objectid == BTRFS_UUID_TREE_OBJECTID) return fs_info->uuid_root ? fs_info->uuid_root : ERR_PTR(-ENOENT); + if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) + return fs_info->free_space_root ? fs_info->free_space_root : + ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) { @@ -2148,6 +2152,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) free_root_extent_buffers(info->uuid_root); if (chunk_root) free_root_extent_buffers(info->chunk_root); + free_root_extent_buffers(info->free_space_root); } void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) @@ -2448,6 +2453,15 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info, fs_info->uuid_root = root; } + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->free_space_root = root; + } + return 0; } @@ -2668,6 +2682,7 @@ int open_ctree(struct super_block *sb, if (btrfs_check_super_csum(bh->b_data)) { printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); err = -EINVAL; + brelse(bh); goto fail_alloc; } @@ -2809,7 +2824,7 @@ int open_ctree(struct super_block *sb, fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, - 4 * 1024 * 1024 / PAGE_CACHE_SIZE); + SZ_4M / PAGE_CACHE_SIZE); tree_root->nodesize = nodesize; tree_root->sectorsize = sectorsize; @@ -3051,6 +3066,18 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; + if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + pr_info("BTRFS: creating free space tree\n"); + ret = btrfs_create_free_space_tree(fs_info); + if (ret) { + pr_warn("BTRFS: failed to create free space tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } + down_read(&fs_info->cleanup_work_sem); if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { @@ -3076,6 +3103,18 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); + if (btrfs_test_opt(tree_root, CLEAR_CACHE) && + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + pr_info("BTRFS: clearing free space tree\n"); + ret = btrfs_clear_free_space_tree(fs_info); + if (ret) { + pr_warn("BTRFS: failed to clear free space tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } + if (!fs_info->uuid_root) { pr_info("BTRFS: creating UUID tree\n"); ret = btrfs_create_uuid_tree(fs_info); @@ -3902,11 +3941,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, return !ret; } -int btrfs_set_buffer_uptodate(struct extent_buffer *buf) -{ - return set_extent_buffer_uptodate(buf); -} - void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { struct btrfs_root *root; @@ -3962,7 +3996,6 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, balance_dirty_pages_ratelimited( root->fs_info->btree_inode->i_mapping); } - return; } void btrfs_btree_balance_dirty(struct btrfs_root *root) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index adeb31830b9c..8e79d0070bcf 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -19,7 +19,7 @@ #ifndef __DISKIO__ #define __DISKIO__ -#define BTRFS_SUPER_INFO_OFFSET (64 * 1024) +#define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 #define BTRFS_SUPER_MIRROR_MAX 3 @@ -35,7 +35,7 @@ enum btrfs_wq_endio_type { static inline u64 btrfs_sb_offset(int mirror) { - u64 start = 16 * 1024; + u64 start = SZ_16K; if (mirror) return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); return BTRFS_SUPER_INFO_OFFSET; @@ -116,7 +116,6 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root) void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, char *result); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c4661db2b72a..60cc1399c64f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,7 @@ #include "raid56.h" #include "locking.h" #include "free-space-cache.h" +#include "free-space-tree.h" #include "math.h" #include "sysfs.h" #include "qgroup.h" @@ -357,8 +358,8 @@ static void fragment_free_space(struct btrfs_root *root, * we need to check the pinned_extents for any extents that can't be used yet * since their free space will be released as soon as the transaction commits. */ -static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_fs_info *info, u64 start, u64 end) +u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) { u64 extent_start, extent_end, size, total_added = 0; int ret; @@ -395,11 +396,10 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, return total_added; } -static noinline void caching_thread(struct btrfs_work *work) +static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group_cache *block_group; struct btrfs_fs_info *fs_info; - struct btrfs_caching_control *caching_ctl; struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -407,17 +407,16 @@ static noinline void caching_thread(struct btrfs_work *work) u64 total_found = 0; u64 last = 0; u32 nritems; - int ret = -ENOMEM; + int ret; bool wakeup = true; - caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; fs_info = block_group->fs_info; extent_root = fs_info->extent_root; path = btrfs_alloc_path(); if (!path) - goto out; + return -ENOMEM; last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@ -438,20 +437,16 @@ static noinline void caching_thread(struct btrfs_work *work) */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 1; + path->reada = READA_FORWARD; key.objectid = last; key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; -again: - mutex_lock(&caching_ctl->mutex); - /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->commit_root_sem); next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto err; + goto out; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -477,12 +472,14 @@ next: up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); cond_resched(); - goto again; + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + goto next; } ret = btrfs_next_leaf(extent_root, path); if (ret < 0) - goto err; + goto out; if (ret) break; leaf = path->nodes[0]; @@ -521,7 +518,7 @@ next: else last = key.objectid + key.offset; - if (total_found > (1024 * 1024 * 2)) { + if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; if (wakeup) wake_up(&caching_ctl->wait); @@ -534,9 +531,37 @@ next: total_found += add_new_free_space(block_group, fs_info, last, block_group->key.objectid + block_group->key.offset); + caching_ctl->progress = (u64)-1; + +out: + btrfs_free_path(path); + return ret; +} + +static noinline void caching_thread(struct btrfs_work *work) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_caching_control *caching_ctl; + struct btrfs_root *extent_root; + int ret; + + caching_ctl = container_of(work, struct btrfs_caching_control, work); + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + extent_root = fs_info->extent_root; + + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + ret = load_free_space_tree(caching_ctl); + else + ret = load_extent_tree_free(caching_ctl); + spin_lock(&block_group->lock); block_group->caching_ctl = NULL; - block_group->cached = BTRFS_CACHE_FINISHED; + block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); #ifdef CONFIG_BTRFS_DEBUG @@ -555,20 +580,11 @@ next: #endif caching_ctl->progress = (u64)-1; -err: - btrfs_free_path(path); - up_read(&fs_info->commit_root_sem); - - free_excluded_extents(extent_root, block_group); + up_read(&fs_info->commit_root_sem); + free_excluded_extents(fs_info->extent_root, block_group); mutex_unlock(&caching_ctl->mutex); -out: - if (ret) { - spin_lock(&block_group->lock); - block_group->caching_ctl = NULL; - block_group->cached = BTRFS_CACHE_ERROR; - spin_unlock(&block_group->lock); - } + wake_up(&caching_ctl->wait); put_caching_control(caching_ctl); @@ -680,8 +696,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } } else { /* - * We are not going to do the fast caching, set cached to the - * appropriate value and wakeup any waiters. + * We're either using the free space tree or no caching at all. + * Set cached to the appropriate value and wakeup any waiters. */ spin_lock(&cache->lock); if (load_cache_only) { @@ -2115,7 +2131,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, @@ -2141,7 +2157,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, @@ -2254,7 +2270,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, } again: - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 1); @@ -2910,6 +2926,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (trans->aborted) return 0; + if (root->fs_info->creating_free_space_tree) + return 0; + if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; @@ -2988,9 +3007,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, return -ENOMEM; extent_op->flags_to_set = flags; - extent_op->update_flags = 1; - extent_op->update_key = 0; - extent_op->is_data = is_data ? 1 : 0; + extent_op->update_flags = true; + extent_op->update_key = false; + extent_op->is_data = is_data ? true : false; extent_op->level = level; ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, @@ -3328,7 +3347,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, * If this block group is smaller than 100 megs don't bother caching the * block group. */ - if (block_group->key.offset < (100 * 1024 * 1024)) { + if (block_group->key.offset < (100 * SZ_1M)) { spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); @@ -3428,7 +3447,7 @@ again: * taking up quite a bit since it's not folded into the other space * cache. */ - num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024); + num_pages = div_u64(block_group->key.offset, SZ_256M); if (!num_pages) num_pages = 1; @@ -3684,11 +3703,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, return -ENOMEM; /* - * We don't need the lock here since we are protected by the transaction - * commit. We want to do the cache_save_setup first and then run the + * Even though we are in the critical section of the transaction commit, + * we can still have concurrent tasks adding elements to this + * transaction's list of dirty block groups. These tasks correspond to + * endio free space workers started when writeback finishes for a + * space cache, which run inode.c:btrfs_finish_ordered_io(), and can + * allocate new block groups as a result of COWing nodes of the root + * tree when updating the free space inode. The writeback for the space + * caches is triggered by an earlier call to + * btrfs_start_dirty_block_groups() and iterations of the following + * loop. + * Also we want to do the cache_save_setup first and then run the * delayed refs to make sure we have the best chance at doing this all * in one shot. */ + spin_lock(&cur_trans->dirty_bgs_lock); while (!list_empty(&cur_trans->dirty_bgs)) { cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, @@ -3700,11 +3729,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, * finish and then do it all again */ if (!list_empty(&cache->io_list)) { + spin_unlock(&cur_trans->dirty_bgs_lock); list_del_init(&cache->io_list); btrfs_wait_cache_io(root, trans, cache, &cache->io_ctl, path, cache->key.objectid); btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); } /* @@ -3712,6 +3743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, * on any pending IO */ list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); should_put = 1; cache_save_setup(cache, trans, path); @@ -3736,6 +3768,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, } if (!ret) { ret = write_one_cache_group(trans, root, path, cache); + /* + * One of the free space endio workers might have + * created a new block group while updating a free space + * cache's inode (at inode.c:btrfs_finish_ordered_io()) + * and hasn't released its transaction handle yet, in + * which case the new block group is still attached to + * its transaction handle and its creation has not + * finished yet (no block group item in the extent tree + * yet, etc). If this is the case, wait for all free + * space endio workers to finish and retry. This is a + * a very rare case so no need for a more efficient and + * complex approach. + */ + if (ret == -ENOENT) { + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ret = write_one_cache_group(trans, root, path, + cache); + } if (ret) btrfs_abort_transaction(trans, root, ret); } @@ -3743,7 +3794,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, /* if its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); } + spin_unlock(&cur_trans->dirty_bgs_lock); while (!list_empty(io)) { cache = list_first_entry(io, struct btrfs_block_group_cache, @@ -4239,14 +4292,13 @@ static int should_alloc_chunk(struct btrfs_root *root, */ if (force == CHUNK_ALLOC_LIMITED) { thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - thresh = max_t(u64, 64 * 1024 * 1024, - div_factor_fine(thresh, 1)); + thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); if (num_bytes - num_allocated < thresh) return 1; } - if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) + if (num_allocated + SZ_2M < div_factor(num_bytes, 8)) return 0; return 1; } @@ -4446,7 +4498,7 @@ out: * transaction. */ if (trans->can_flush_pending_bgs && - trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + trans->chunk_bytes_reserved >= (u64)SZ_2M) { btrfs_create_pending_block_groups(trans, trans->root); btrfs_trans_release_chunk_metadata(trans); } @@ -4544,7 +4596,7 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) return nr; } -#define EXTENT_SIZE_PER_ITEM (256 * 1024) +#define EXTENT_SIZE_PER_ITEM SZ_256K /* * shrink metadata reservation for delalloc @@ -4749,8 +4801,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, u64 expected; u64 to_reclaim; - to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, - 16 * 1024 * 1024); + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); spin_lock(&space_info->lock); if (can_overcommit(root, space_info, to_reclaim, BTRFS_RESERVE_FLUSH_ALL)) { @@ -4761,8 +4812,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; - if (can_overcommit(root, space_info, 1024 * 1024, - BTRFS_RESERVE_FLUSH_ALL)) + if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -5318,7 +5368,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); - block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); + block_rsv->size = min_t(u64, num_bytes, SZ_512M); num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + sinfo->bytes_reserved + sinfo->bytes_readonly + @@ -6222,11 +6272,11 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, return ret; if (ssd) - *empty_cluster = 2 * 1024 * 1024; + *empty_cluster = SZ_2M; if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { ret = &root->fs_info->meta_alloc_cluster; if (!ssd) - *empty_cluster = 64 * 1024; + *empty_cluster = SZ_64K; } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { ret = &root->fs_info->data_alloc_cluster; } @@ -6438,7 +6488,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; @@ -6661,6 +6711,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } + ret = add_to_free_space_tree(trans, root->fs_info, bytenr, + num_bytes); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); @@ -7672,6 +7729,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); + ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, + ins->offset); + if (ret) + return ret; + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -7752,6 +7814,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); + ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, + num_bytes); + if (ret) + return ret; + ret = update_block_group(trans, root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ @@ -7834,7 +7901,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); - btrfs_set_buffer_uptodate(buf); + set_extent_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; @@ -7980,12 +8047,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, else memset(&extent_op->key, 0, sizeof(extent_op->key)); extent_op->flags_to_set = flags; - if (skinny_metadata) - extent_op->update_key = 0; - else - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; + extent_op->update_key = skinny_metadata ? false : true; + extent_op->update_flags = true; + extent_op->is_data = false; extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, @@ -9124,7 +9188,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) if ((sinfo->flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && !force) - min_allocable_bytes = 1 * 1024 * 1024; + min_allocable_bytes = SZ_1M; else min_allocable_bytes = 0; @@ -9656,6 +9720,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) cache->full_stripe_len = btrfs_full_stripe_len(root, &root->fs_info->mapping_tree, start); + set_free_space_tree_thresholds(cache); + atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); init_rwsem(&cache->data_rwsem); @@ -9667,6 +9733,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); + mutex_init(&cache->free_space_lock); return cache; } @@ -9691,7 +9758,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_test_opt(root, SPACE_CACHE) && @@ -9877,6 +9944,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, key.objectid, key.offset); if (ret) btrfs_abort_transaction(trans, extent_root, ret); + add_block_group_free_space(trans, root->fs_info, block_group); + /* already aborted the transaction if it failed. */ next: list_del_init(&block_group->bg_list); } @@ -9907,6 +9976,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + cache->needs_free_space = 1; ret = exclude_super_stripes(root, cache); if (ret) { /* @@ -10277,6 +10347,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, unlock_chunks(root); + ret = remove_block_group_free_space(trans, root->fs_info, block_group); + if (ret) + goto out; + btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/fs/btrfs/extent-tree.h +++ /dev/null diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9abe18763a7f..2e7c97a3f344 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1285,20 +1285,6 @@ search_again: } /* wrappers around set/clear extent bit */ -int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, - NULL, mask); -} - -int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask) -{ - return set_extent_bit(tree, start, end, bits, NULL, - NULL, mask); -} - int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask, struct extent_changeset *changeset) @@ -1323,17 +1309,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, cached, mask, NULL); } -int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask) -{ - int wake = 0; - - if (bits & EXTENT_LOCKED) - wake = 1; - - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); -} - int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask, struct extent_changeset *changeset) @@ -1348,63 +1323,18 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, changeset); } -int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE, - NULL, cached_state, mask); -} - -int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - NULL, cached_state, mask); -} - -int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); -} - -int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, - NULL, mask); -} - -int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, - cached_state, mask); -} - -int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - cached_state, mask); -} - /* * either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_state **cached_state) + struct extent_state **cached_state) { int err; u64 failed_start; while (1) { - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, &failed_start, cached_state, GFP_NOFS, NULL); if (err == -EEXIST) { @@ -1417,11 +1347,6 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, return err; } -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return lock_extent_bits(tree, start, end, 0, NULL); -} - int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { int err; @@ -1438,20 +1363,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) return 1; } -int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - mask); -} - -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - GFP_NOFS); -} - -int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; @@ -1464,10 +1376,9 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) page_cache_release(page); index++; } - return 0; } -int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; @@ -1481,13 +1392,12 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) page_cache_release(page); index++; } - return 0; } /* * helper function to set both pages and extents in the tree writeback */ -static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; @@ -1500,7 +1410,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) page_cache_release(page); index++; } - return 0; } /* find the first state struct with 'bits' set after 'start', and @@ -1800,7 +1709,7 @@ again: BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); + lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, @@ -1820,7 +1729,7 @@ out_failed: return found; } -int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, +void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, unsigned clear_bits, unsigned long page_ops) @@ -1835,7 +1744,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); if (page_ops == 0) - return 0; + return; if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) mapping_set_error(inode->i_mapping, -EIO); @@ -1869,7 +1778,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, index += ret; cond_resched(); } - return 0; } /* @@ -2516,7 +2424,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, /* lots and lots of room for performance fixes in the end_bio funcs */ -int end_extent_writepage(struct page *page, int err, u64 start, u64 end) +void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { int uptodate = (err == 0); struct extent_io_tree *tree; @@ -2537,7 +2445,6 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) ret = ret < 0 ? ret : -EIO; mapping_set_error(page->mapping, ret); } - return 0; } /* @@ -2579,9 +2486,7 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - if (end_extent_writepage(page, bio->bi_error, start, end)) - continue; - + end_extent_writepage(page, bio->bi_error, start, end); end_page_writeback(page); } @@ -4326,7 +4231,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent_bits(tree, start, end, 0, &cached_state); + lock_extent_bits(tree, start, end, &cached_state); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | @@ -4387,7 +4292,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, u64 end = start + PAGE_CACHE_SIZE - 1; if (gfpflags_allow_blocking(mask) && - page->mapping->host->i_size > 16 * 1024 * 1024) { + page->mapping->host->i_size > SZ_16M) { u64 len; while (start <= end) { len = end - start + 1; @@ -4536,7 +4441,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent, @@ -4797,24 +4702,14 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) return new; } -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) +struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len) { struct extent_buffer *eb; - unsigned long len; unsigned long num_pages; unsigned long i; - if (!fs_info) { - /* - * Called only from tests that don't always have a fs_info - * available, but we know that nodesize is 4096 - */ - len = 4096; - } else { - len = fs_info->tree_root->nodesize; - } - num_pages = num_extent_pages(0, len); + num_pages = num_extent_pages(start, len); eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) @@ -4837,6 +4732,24 @@ err: return NULL; } +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + unsigned long len; + + if (!fs_info) { + /* + * Called only from tests that don't always have a fs_info + * available, but we know that nodesize is 4096 + */ + len = 4096; + } else { + len = fs_info->tree_root->nodesize; + } + + return __alloc_dummy_extent_buffer(fs_info, start, len); +} + static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; @@ -5227,7 +5140,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb) return was_dirty; } -int clear_extent_buffer_uptodate(struct extent_buffer *eb) +void clear_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; @@ -5240,10 +5153,9 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb) if (page) ClearPageUptodate(page); } - return 0; } -int set_extent_buffer_uptodate(struct extent_buffer *eb) +void set_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; @@ -5255,7 +5167,6 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb) page = eb->pages[i]; SetPageUptodate(page); } - return 0; } int extent_buffer_uptodate(struct extent_buffer *eb) @@ -5594,6 +5505,155 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } +/* + * The extent buffer bitmap operations are done with byte granularity because + * bitmap items are not guaranteed to be aligned to a word and therefore a + * single word in a bitmap may straddle two pages in the extent buffer. + */ +#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) +#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BITMAP_FIRST_BYTE_MASK(start) \ + ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) +#define BITMAP_LAST_BYTE_MASK(nbits) \ + (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) + +/* + * eb_bitmap_offset() - calculate the page and offset of the byte containing the + * given bit number + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number + * @page_index: return index of the page in the extent buffer that contains the + * given bit number + * @page_offset: return offset into the page given by page_index + * + * This helper hides the ugliness of finding the byte in an extent buffer which + * contains a given bit. + */ +static inline void eb_bitmap_offset(struct extent_buffer *eb, + unsigned long start, unsigned long nr, + unsigned long *page_index, + size_t *page_offset) +{ + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + size_t byte_offset = BIT_BYTE(nr); + size_t offset; + + /* + * The byte we want is the offset of the extent buffer + the offset of + * the bitmap item in the extent buffer + the offset of the byte in the + * bitmap item. + */ + offset = start_offset + start + byte_offset; + + *page_index = offset >> PAGE_CACHE_SHIFT; + *page_offset = offset & (PAGE_CACHE_SIZE - 1); +} + +/** + * extent_buffer_test_bit - determine whether a bit in a bitmap item is set + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number to test + */ +int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, + unsigned long nr) +{ + char *kaddr; + struct page *page; + unsigned long i; + size_t offset; + + eb_bitmap_offset(eb, start, nr, &i, &offset); + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); +} + +/** + * extent_buffer_bitmap_set - set an area of a bitmap + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to set + */ +void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len) +{ + char *kaddr; + struct page *page; + unsigned long i; + size_t offset; + const unsigned int size = pos + len; + int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); + unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); + + eb_bitmap_offset(eb, start, pos, &i, &offset); + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + + while (len >= bits_to_set) { + kaddr[offset] |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_BYTE; + mask_to_set = ~0U; + if (++offset >= PAGE_CACHE_SIZE && len > 0) { + offset = 0; + page = eb->pages[++i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + } + } + if (len) { + mask_to_set &= BITMAP_LAST_BYTE_MASK(size); + kaddr[offset] |= mask_to_set; + } +} + + +/** + * extent_buffer_bitmap_clear - clear an area of a bitmap + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to clear + */ +void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len) +{ + char *kaddr; + struct page *page; + unsigned long i; + size_t offset; + const unsigned int size = pos + len; + int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); + unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); + + eb_bitmap_offset(eb, start, pos, &i, &offset); + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + + while (len >= bits_to_clear) { + kaddr[offset] &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_BYTE; + mask_to_clear = ~0U; + if (++offset >= PAGE_CACHE_SIZE && len > 0) { + offset = 0; + page = eb->pages[++i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + } + } + if (len) { + mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); + kaddr[offset] &= ~mask_to_clear; + } +} + static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) { unsigned long distance = (src > dst) ? src - dst : dst - src; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index f4c1ae11855f..0377413bd4b9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -199,12 +199,14 @@ int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_state **cached); -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); -int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached, gfp_t mask); + struct extent_state **cached); + +static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return lock_extent_bits(tree, start, end, NULL); +} + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); @@ -221,39 +223,105 @@ void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int filled, struct extent_state *cached_state); -int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask, struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); -int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask); + +static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, + GFP_NOFS); +} + +static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + mask); +} + +static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, unsigned bits, gfp_t mask) +{ + int wake = 0; + + if (bits & EXTENT_LOCKED) + wake = 1; + + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); +} + int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); -int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); -int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); -int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); + +static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, unsigned bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); +} + +static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + cached_state, mask); +} + +static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, + NULL, mask); +} + +static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); +} + int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, struct extent_state **cached_state, gfp_t mask); -int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); + +static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE, + NULL, cached_state, mask); +} + +static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, + NULL, cached_state, mask); +} + +static inline int set_extent_new(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); +} + +static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, + cached_state, mask); +} + int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, unsigned bits, struct extent_state **cached_state); @@ -282,8 +350,10 @@ void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); +struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); @@ -328,19 +398,25 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len); +int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, + unsigned long pos); +void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len); +void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len); void clear_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_buffer *eb); -int set_extent_buffer_uptodate(struct extent_buffer *eb); -int clear_extent_buffer_uptodate(struct extent_buffer *eb); +void set_extent_buffer_uptodate(struct extent_buffer *eb); +void clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(struct extent_buffer *eb); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, unsigned long *map_len); -int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); -int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); -int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, +void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); +void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, unsigned bits_to_clear, unsigned long page_ops); @@ -357,7 +433,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, int mirror_num); int clean_io_failure(struct inode *inode, u64 start, struct page *page, unsigned int pg_offset); -int end_extent_writepage(struct page *page, int err, u64 start, u64 end); +void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 58ece6558430..a67e1c828d0f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -202,7 +202,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, } if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8) - path->reada = 2; + path->reada = READA_FORWARD; WARN_ON(bio->bi_vcnt <= 0); @@ -328,7 +328,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (search_commit) { path->skip_locking = 1; - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e3d9022bfd4e..83d7859d7619 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1394,7 +1394,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos, 0, cached_state); + start_pos, last_pos, cached_state); ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && @@ -2398,7 +2398,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache_range(inode, lockstart, lockend); lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, &cached_state); + &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, lockend); /* @@ -2705,7 +2705,7 @@ static long btrfs_fallocate(struct file *file, int mode, * transaction */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state); + locked_end, &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -2852,7 +2852,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) lockend--; len = lockend - lockstart + 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); while (start < inode->i_size) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cfe99bec49de..8f835bfa1bdd 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -30,7 +30,7 @@ #include "volumes.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) -#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) +#define MAX_CACHE_BYTES_PER_GIG SZ_32K struct btrfs_trim_range { u64 start; @@ -1086,14 +1086,11 @@ write_pinned_extent_entries(struct btrfs_root *root, static noinline_for_stack int write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) { - struct list_head *pos, *n; + struct btrfs_free_space *entry, *next; int ret; /* Write out the bitmaps */ - list_for_each_safe(pos, n, bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); - + list_for_each_entry_safe(entry, next, bitmap_list, list) { ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); if (ret) return -ENOSPC; @@ -1119,13 +1116,10 @@ static int flush_dirty_cache(struct inode *inode) static void noinline_for_stack cleanup_bitmap_list(struct list_head *bitmap_list) { - struct list_head *pos, *n; + struct btrfs_free_space *entry, *next; - list_for_each_safe(pos, n, bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); + list_for_each_entry_safe(entry, next, bitmap_list, list) list_del_init(&entry->list); - } } static void noinline_for_stack @@ -1261,7 +1255,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, goto out; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - 0, &cached_state); + &cached_state); io_ctl_set_generation(io_ctl, trans->transid); @@ -1656,11 +1650,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) * at or below 32k, so we need to adjust how much memory we allow to be * used by extent based free space tracking */ - if (size < 1024 * 1024 * 1024) + if (size < SZ_1G) max_bytes = MAX_CACHE_BYTES_PER_GIG; else - max_bytes = MAX_CACHE_BYTES_PER_GIG * - div_u64(size, 1024 * 1024 * 1024); + max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); /* * we want to account for 1 more bitmap than what we have so we can make @@ -2016,7 +2009,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, return true; } -static struct btrfs_free_space_op free_space_op = { +static const struct btrfs_free_space_op free_space_op = { .recalc_thresholds = recalculate_thresholds, .use_bitmap = use_bitmap, }; @@ -2489,8 +2482,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) * track of free space, and if we pass 1/2 of that we want to * start converting things over to using bitmaps */ - ctl->extents_thresh = ((1024 * 32) / 2) / - sizeof(struct btrfs_free_space); + ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); } /* diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index f251865eb6f3..33178c490ace 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -37,7 +37,7 @@ struct btrfs_free_space_ctl { int total_bitmaps; int unit; u64 start; - struct btrfs_free_space_op *op; + const struct btrfs_free_space_op *op; void *private; struct mutex cache_writeout_mutex; struct list_head trimming_ranges; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c new file mode 100644 index 000000000000..393e36bd5845 --- /dev/null +++ b/fs/btrfs/free-space-tree.c @@ -0,0 +1,1591 @@ +/* + * Copyright (C) 2015 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/vmalloc.h> +#include "ctree.h" +#include "disk-io.h" +#include "locking.h" +#include "free-space-tree.h" +#include "transaction.h" + +static int __add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); + +void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache) +{ + u32 bitmap_range; + size_t bitmap_size; + u64 num_bitmaps, total_bitmap_size; + + /* + * We convert to bitmaps when the disk space required for using extents + * exceeds that required for using bitmaps. + */ + bitmap_range = cache->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; + num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1, + bitmap_range); + bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE; + total_bitmap_size = num_bitmaps * bitmap_size; + cache->bitmap_high_thresh = div_u64(total_bitmap_size, + sizeof(struct btrfs_item)); + + /* + * We allow for a small buffer between the high threshold and low + * threshold to avoid thrashing back and forth between the two formats. + */ + if (cache->bitmap_high_thresh > 100) + cache->bitmap_low_thresh = cache->bitmap_high_thresh - 100; + else + cache->bitmap_low_thresh = 0; +} + +static int add_new_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_free_space_info *info; + struct btrfs_key key; + struct extent_buffer *leaf; + int ret; + + key.objectid = block_group->key.objectid; + key.type = BTRFS_FREE_SPACE_INFO_KEY; + key.offset = block_group->key.offset; + + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info)); + if (ret) + goto out; + + leaf = path->nodes[0]; + info = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_info); + btrfs_set_free_space_extent_count(leaf, info, 0); + btrfs_set_free_space_flags(leaf, info, 0); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_release_path(path); + return ret; +} + +struct btrfs_free_space_info * +search_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, int cow) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key; + int ret; + + key.objectid = block_group->key.objectid; + key.type = BTRFS_FREE_SPACE_INFO_KEY; + key.offset = block_group->key.offset; + + ret = btrfs_search_slot(trans, root, &key, path, 0, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret != 0) { + btrfs_warn(fs_info, "missing free space info for %llu\n", + block_group->key.objectid); + ASSERT(0); + return ERR_PTR(-ENOENT); + } + + return btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_free_space_info); +} + +/* + * btrfs_search_slot() but we're looking for the greatest key less than the + * passed key. + */ +static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int ins_len, int cow) +{ + int ret; + + ret = btrfs_search_slot(trans, root, key, p, ins_len, cow); + if (ret < 0) + return ret; + + if (ret == 0) { + ASSERT(0); + return -EIO; + } + + if (p->slots[0] == 0) { + ASSERT(0); + return -EIO; + } + p->slots[0]--; + + return 0; +} + +static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) +{ + return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE); +} + +static unsigned long *alloc_bitmap(u32 bitmap_size) +{ + return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); +} + +int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_free_space_info *info; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + unsigned long *bitmap; + char *bitmap_cursor; + u64 start, end; + u64 bitmap_range, i; + u32 bitmap_size, flags, expected_extent_count; + u32 extent_count = 0; + int done = 0, nr; + int ret; + + bitmap_size = free_space_bitmap_size(block_group->key.offset, + block_group->sectorsize); + bitmap = alloc_bitmap(bitmap_size); + if (!bitmap) { + ret = -ENOMEM; + goto out; + } + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->key.objectid); + ASSERT(found_key.offset == block_group->key.offset); + done = 1; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) { + u64 first, last; + + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + + first = div_u64(found_key.objectid - start, + block_group->sectorsize); + last = div_u64(found_key.objectid + found_key.offset - start, + block_group->sectorsize); + bitmap_set(bitmap, first, last - first); + + extent_count++; + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + info = search_free_space_info(trans, fs_info, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + leaf = path->nodes[0]; + flags = btrfs_free_space_flags(leaf, info); + flags |= BTRFS_FREE_SPACE_USING_BITMAPS; + btrfs_set_free_space_flags(leaf, info, flags); + expected_extent_count = btrfs_free_space_extent_count(leaf, info); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + bitmap_cursor = (char *)bitmap; + bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; + i = start; + while (i < end) { + unsigned long ptr; + u64 extent_size; + u32 data_size; + + extent_size = min(end - i, bitmap_range); + data_size = free_space_bitmap_size(extent_size, + block_group->sectorsize); + + key.objectid = i; + key.type = BTRFS_FREE_SPACE_BITMAP_KEY; + key.offset = extent_size; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + data_size); + if (ret) + goto out; + + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, bitmap_cursor, ptr, + data_size); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + i += extent_size; + bitmap_cursor += data_size; + } + + ret = 0; +out: + vfree(bitmap); + if (ret) + btrfs_abort_transaction(trans, root, ret); + return ret; +} + +int convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_free_space_info *info; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + unsigned long *bitmap; + u64 start, end; + /* Initialize to silence GCC. */ + u64 extent_start = 0; + u64 offset; + u32 bitmap_size, flags, expected_extent_count; + int prev_bit = 0, bit, bitnr; + u32 extent_count = 0; + int done = 0, nr; + int ret; + + bitmap_size = free_space_bitmap_size(block_group->key.offset, + block_group->sectorsize); + bitmap = alloc_bitmap(bitmap_size); + if (!bitmap) { + ret = -ENOMEM; + goto out; + } + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->key.objectid); + ASSERT(found_key.offset == block_group->key.offset); + done = 1; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { + unsigned long ptr; + char *bitmap_cursor; + u32 bitmap_pos, data_size; + + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + + bitmap_pos = div_u64(found_key.objectid - start, + block_group->sectorsize * + BITS_PER_BYTE); + bitmap_cursor = ((char *)bitmap) + bitmap_pos; + data_size = free_space_bitmap_size(found_key.offset, + block_group->sectorsize); + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1); + read_extent_buffer(leaf, bitmap_cursor, ptr, + data_size); + + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + info = search_free_space_info(trans, fs_info, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + leaf = path->nodes[0]; + flags = btrfs_free_space_flags(leaf, info); + flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; + btrfs_set_free_space_flags(leaf, info, flags); + expected_extent_count = btrfs_free_space_extent_count(leaf, info); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + offset = start; + bitnr = 0; + while (offset < end) { + bit = !!test_bit(bitnr, bitmap); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + key.objectid = extent_start; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = offset - extent_start; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + btrfs_release_path(path); + + extent_count++; + } + prev_bit = bit; + offset += block_group->sectorsize; + bitnr++; + } + if (prev_bit == 1) { + key.objectid = extent_start; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = end - extent_start; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + btrfs_release_path(path); + + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + ret = 0; +out: + vfree(bitmap); + if (ret) + btrfs_abort_transaction(trans, root, ret); + return ret; +} + +static int update_free_space_extent_count(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + int new_extents) +{ + struct btrfs_free_space_info *info; + u32 flags; + u32 extent_count; + int ret = 0; + + if (new_extents == 0) + return 0; + + info = search_free_space_info(trans, fs_info, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + flags = btrfs_free_space_flags(path->nodes[0], info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + + extent_count += new_extents; + btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(path); + + if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && + extent_count > block_group->bitmap_high_thresh) { + ret = convert_free_space_to_bitmaps(trans, fs_info, block_group, + path); + } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) && + extent_count < block_group->bitmap_low_thresh) { + ret = convert_free_space_to_extents(trans, fs_info, block_group, + path); + } + +out: + return ret; +} + +int free_space_test_bit(struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 offset) +{ + struct extent_buffer *leaf; + struct btrfs_key key; + u64 found_start, found_end; + unsigned long ptr, i; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(offset >= found_start && offset < found_end); + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + i = div_u64(offset - found_start, block_group->sectorsize); + return !!extent_buffer_test_bit(leaf, ptr, i); +} + +static void free_space_set_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 *start, u64 *size, + int bit) +{ + struct extent_buffer *leaf; + struct btrfs_key key; + u64 end = *start + *size; + u64 found_start, found_end; + unsigned long ptr, first, last; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(*start >= found_start && *start < found_end); + ASSERT(end > found_start); + + if (end > found_end) + end = found_end; + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + first = div_u64(*start - found_start, block_group->sectorsize); + last = div_u64(end - found_start, block_group->sectorsize); + if (bit) + extent_buffer_bitmap_set(leaf, ptr, first, last - first); + else + extent_buffer_bitmap_clear(leaf, ptr, first, last - first); + btrfs_mark_buffer_dirty(leaf); + + *size -= end - *start; + *start = end; +} + +/* + * We can't use btrfs_next_item() in modify_free_space_bitmap() because + * btrfs_next_leaf() doesn't get the path for writing. We can forgo the fancy + * tree walking in btrfs_next_leaf() anyways because we know exactly what we're + * looking for. + */ +static int free_space_next_bitmap(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p) +{ + struct btrfs_key key; + + if (p->slots[0] + 1 < btrfs_header_nritems(p->nodes[0])) { + p->slots[0]++; + return 0; + } + + btrfs_item_key_to_cpu(p->nodes[0], &key, p->slots[0]); + btrfs_release_path(p); + + key.objectid += key.offset; + key.type = (u8)-1; + key.offset = (u64)-1; + + return btrfs_search_prev_slot(trans, root, &key, p, 0, 1); +} + +/* + * If remove is 1, then we are removing free space, thus clearing bits in the + * bitmap. If remove is 0, then we are adding free space, thus setting bits in + * the bitmap. + */ +static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + u64 start, u64 size, int remove) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key; + u64 end = start + size; + u64 cur_start, cur_size; + int prev_bit, next_bit; + int new_extents; + int ret; + + /* + * Read the bit for the block immediately before the extent of space if + * that block is within the block group. + */ + if (start > block_group->key.objectid) { + u64 prev_block = start - block_group->sectorsize; + + key.objectid = prev_block; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); + if (ret) + goto out; + + prev_bit = free_space_test_bit(block_group, path, prev_block); + + /* The previous block may have been in the previous bitmap. */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (start >= key.objectid + key.offset) { + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + } else { + key.objectid = start; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); + if (ret) + goto out; + + prev_bit = -1; + } + + /* + * Iterate over all of the bitmaps overlapped by the extent of space, + * clearing/setting bits as required. + */ + cur_start = start; + cur_size = size; + while (1) { + free_space_set_bits(block_group, path, &cur_start, &cur_size, + !remove); + if (cur_size == 0) + break; + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + + /* + * Read the bit for the block immediately after the extent of space if + * that block is within the block group. + */ + if (end < block_group->key.objectid + block_group->key.offset) { + /* The next block may be in the next bitmap. */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (end >= key.objectid + key.offset) { + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + + next_bit = free_space_test_bit(block_group, path, end); + } else { + next_bit = -1; + } + + if (remove) { + new_extents = -1; + if (prev_bit == 1) { + /* Leftover on the left. */ + new_extents++; + } + if (next_bit == 1) { + /* Leftover on the right. */ + new_extents++; + } + } else { + new_extents = 1; + if (prev_bit == 1) { + /* Merging with neighbor on the left. */ + new_extents--; + } + if (next_bit == 1) { + /* Merging with neighbor on the right. */ + new_extents--; + } + } + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, fs_info, block_group, path, + new_extents); + +out: + return ret; +} + +static int remove_free_space_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + u64 start, u64 size) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key; + u64 found_start, found_end; + u64 end = start + size; + int new_extents = -1; + int ret; + + key.objectid = start; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(start >= found_start && end <= found_end); + + /* + * Okay, now that we've found the free space extent which contains the + * free space that we are removing, there are four cases: + * + * 1. We're using the whole extent: delete the key we found and + * decrement the free space extent count. + * 2. We are using part of the extent starting at the beginning: delete + * the key we found and insert a new key representing the leftover at + * the end. There is no net change in the number of extents. + * 3. We are using part of the extent ending at the end: delete the key + * we found and insert a new key representing the leftover at the + * beginning. There is no net change in the number of extents. + * 4. We are using part of the extent in the middle: delete the key we + * found and insert two new keys representing the leftovers on each + * side. Where we used to have one extent, we now have two, so increment + * the extent count. We may need to convert the block group to bitmaps + * as a result. + */ + + /* Delete the existing key (cases 1-4). */ + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + + /* Add a key for leftovers at the beginning (cases 3 and 4). */ + if (start > found_start) { + key.objectid = found_start; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = start - found_start; + + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + new_extents++; + } + + /* Add a key for leftovers at the end (cases 2 and 4). */ + if (end < found_end) { + key.objectid = end; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = found_end - end; + + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + new_extents++; + } + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, fs_info, block_group, path, + new_extents); + +out: + return ret; +} + +int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + if (block_group->needs_free_space) { + ret = __add_block_group_free_space(trans, fs_info, block_group, + path); + if (ret) + return ret; + } + + info = search_free_space_info(NULL, fs_info, block_group, path, 0); + if (IS_ERR(info)) + return PTR_ERR(info); + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + return modify_free_space_bitmap(trans, fs_info, block_group, + path, start, size, 1); + } else { + return remove_free_space_extent(trans, fs_info, block_group, + path, start, size); + } +} + +int remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_path *path; + int ret; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + block_group = btrfs_lookup_block_group(fs_info, start); + if (!block_group) { + ASSERT(0); + ret = -ENOENT; + goto out; + } + + mutex_lock(&block_group->free_space_lock); + ret = __remove_from_free_space_tree(trans, fs_info, block_group, path, + start, size); + mutex_unlock(&block_group->free_space_lock); + + btrfs_put_block_group(block_group); +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + return ret; +} + +static int add_free_space_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + u64 start, u64 size) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key, new_key; + u64 found_start, found_end; + u64 end = start + size; + int new_extents = 1; + int ret; + + /* + * We are adding a new extent of free space, but we need to merge + * extents. There are four cases here: + * + * 1. The new extent does not have any immediate neighbors to merge + * with: add the new key and increment the free space extent count. We + * may need to convert the block group to bitmaps as a result. + * 2. The new extent has an immediate neighbor before it: remove the + * previous key and insert a new key combining both of them. There is no + * net change in the number of extents. + * 3. The new extent has an immediate neighbor after it: remove the next + * key and insert a new key combining both of them. There is no net + * change in the number of extents. + * 4. The new extent has immediate neighbors on both sides: remove both + * of the keys and insert a new key combining all of them. Where we used + * to have two extents, we now have one, so decrement the extent count. + */ + + new_key.objectid = start; + new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + new_key.offset = size; + + /* Search for a neighbor on the left. */ + if (start == block_group->key.objectid) + goto right; + key.objectid = start - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) { + ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY); + btrfs_release_path(path); + goto right; + } + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(found_start >= block_group->key.objectid && + found_end > block_group->key.objectid); + ASSERT(found_start < start && found_end <= start); + + /* + * Delete the neighbor on the left and absorb it into the new key (cases + * 2 and 4). + */ + if (found_end == start) { + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + new_key.objectid = found_start; + new_key.offset += key.offset; + new_extents--; + } + btrfs_release_path(path); + +right: + /* Search for a neighbor on the right. */ + if (end == block_group->key.objectid + block_group->key.offset) + goto insert; + key.objectid = end; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) { + ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY); + btrfs_release_path(path); + goto insert; + } + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(found_start >= block_group->key.objectid && + found_end > block_group->key.objectid); + ASSERT((found_start < start && found_end <= start) || + (found_start >= end && found_end > end)); + + /* + * Delete the neighbor on the right and absorb it into the new key + * (cases 3 and 4). + */ + if (found_start == end) { + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + new_key.offset += key.offset; + new_extents--; + } + btrfs_release_path(path); + +insert: + /* Insert the new key (cases 1-4). */ + ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0); + if (ret) + goto out; + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, fs_info, block_group, path, + new_extents); + +out: + return ret; +} + +int __add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + if (block_group->needs_free_space) { + ret = __add_block_group_free_space(trans, fs_info, block_group, + path); + if (ret) + return ret; + } + + info = search_free_space_info(NULL, fs_info, block_group, path, 0); + if (IS_ERR(info)) + return PTR_ERR(info); + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + return modify_free_space_bitmap(trans, fs_info, block_group, + path, start, size, 0); + } else { + return add_free_space_extent(trans, fs_info, block_group, path, + start, size); + } +} + +int add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_path *path; + int ret; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + block_group = btrfs_lookup_block_group(fs_info, start); + if (!block_group) { + ASSERT(0); + ret = -ENOENT; + goto out; + } + + mutex_lock(&block_group->free_space_lock); + ret = __add_to_free_space_tree(trans, fs_info, block_group, path, start, + size); + mutex_unlock(&block_group->free_space_lock); + + btrfs_put_block_group(block_group); +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + return ret; +} + +/* + * Populate the free space tree by walking the extent tree. Operations on the + * extent tree that happen as a result of writes to the free space tree will go + * through the normal add/remove hooks. + */ +static int populate_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path, *path2; + struct btrfs_key key; + u64 start, end; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + path2 = btrfs_alloc_path(); + if (!path2) { + btrfs_free_path(path); + return -ENOMEM; + } + + ret = add_new_free_space_info(trans, fs_info, block_group, path2); + if (ret) + goto out; + + mutex_lock(&block_group->free_space_lock); + + /* + * Iterate through all of the extent and metadata items in this block + * group, adding the free space between them and the free space at the + * end. Note that EXTENT_ITEM and METADATA_ITEM are less than + * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's + * contained in. + */ + key.objectid = block_group->key.objectid; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); + if (ret < 0) + goto out_locked; + ASSERT(ret == 0); + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + while (1) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { + if (key.objectid >= end) + break; + + if (start < key.objectid) { + ret = __add_to_free_space_tree(trans, fs_info, + block_group, + path2, start, + key.objectid - + start); + if (ret) + goto out_locked; + } + start = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY) + start += fs_info->tree_root->nodesize; + else + start += key.offset; + } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + if (key.objectid != block_group->key.objectid) + break; + } + + ret = btrfs_next_item(extent_root, path); + if (ret < 0) + goto out_locked; + if (ret) + break; + } + if (start < end) { + ret = __add_to_free_space_tree(trans, fs_info, block_group, + path2, start, end - start); + if (ret) + goto out_locked; + } + + ret = 0; +out_locked: + mutex_unlock(&block_group->free_space_lock); +out: + btrfs_free_path(path2); + btrfs_free_path(path); + return ret; +} + +int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *free_space_root; + struct btrfs_block_group_cache *block_group; + struct rb_node *node; + int ret; + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + fs_info->creating_free_space_tree = 1; + free_space_root = btrfs_create_tree(trans, fs_info, + BTRFS_FREE_SPACE_TREE_OBJECTID); + if (IS_ERR(free_space_root)) { + ret = PTR_ERR(free_space_root); + goto abort; + } + fs_info->free_space_root = free_space_root; + + node = rb_first(&fs_info->block_group_cache_tree); + while (node) { + block_group = rb_entry(node, struct btrfs_block_group_cache, + cache_node); + ret = populate_free_space_tree(trans, fs_info, block_group); + if (ret) + goto abort; + node = rb_next(node); + } + + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + fs_info->creating_free_space_tree = 0; + + ret = btrfs_commit_transaction(trans, tree_root); + if (ret) + return ret; + + return 0; + +abort: + fs_info->creating_free_space_tree = 0; + btrfs_abort_transaction(trans, tree_root, ret); + btrfs_end_transaction(trans, tree_root); + return ret; +} + +static int clear_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + int nr; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + key.objectid = 0; + key.type = 0; + key.offset = 0; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + nr = btrfs_header_nritems(path->nodes[0]); + if (!nr) + break; + + path->slots[0] = 0; + ret = btrfs_del_items(trans, root, path, 0, nr); + if (ret) + goto out; + + btrfs_release_path(path); + } + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *free_space_root = fs_info->free_space_root; + int ret; + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); + fs_info->free_space_root = NULL; + + ret = clear_free_space_tree(trans, free_space_root); + if (ret) + goto abort; + + ret = btrfs_del_root(trans, tree_root, &free_space_root->root_key); + if (ret) + goto abort; + + list_del(&free_space_root->dirty_list); + + btrfs_tree_lock(free_space_root->node); + clean_tree_block(trans, tree_root->fs_info, free_space_root->node); + btrfs_tree_unlock(free_space_root->node); + btrfs_free_tree_block(trans, free_space_root, free_space_root->node, + 0, 1); + + free_extent_buffer(free_space_root->node); + free_extent_buffer(free_space_root->commit_root); + kfree(free_space_root); + + ret = btrfs_commit_transaction(trans, tree_root); + if (ret) + return ret; + + return 0; + +abort: + btrfs_abort_transaction(trans, tree_root, ret); + btrfs_end_transaction(trans, tree_root); + return ret; +} + +static int __add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + u64 start, end; + int ret; + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + block_group->needs_free_space = 0; + + ret = add_new_free_space_info(trans, fs_info, block_group, path); + if (ret) + return ret; + + return __add_to_free_space_tree(trans, fs_info, block_group, path, + block_group->key.objectid, + block_group->key.offset); +} + +int add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_path *path = NULL; + int ret = 0; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + mutex_lock(&block_group->free_space_lock); + if (!block_group->needs_free_space) + goto out; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = __add_block_group_free_space(trans, fs_info, block_group, path); + +out: + btrfs_free_path(path); + mutex_unlock(&block_group->free_space_lock); + if (ret) + btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + return ret; +} + +int remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_path *path; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + u64 start, end; + int done = 0, nr; + int ret; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + if (block_group->needs_free_space) { + /* We never added this block group to the free space tree. */ + return 0; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->key.objectid); + ASSERT(found_key.offset == block_group->key.offset); + done = 1; + nr++; + path->slots[0]--; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY || + found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + ret = 0; +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, root, ret); + return ret; +} + +static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, + struct btrfs_path *path, + u32 expected_extent_count) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + int prev_bit = 0, bit; + /* Initialize to silence GCC. */ + u64 extent_start = 0; + u64 end, offset; + u64 total_found = 0; + u32 extent_count = 0; + int ret; + + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + root = fs_info->free_space_root; + + end = block_group->key.objectid + block_group->key.offset; + + while (1) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_FREE_SPACE_INFO_KEY) + break; + + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + ASSERT(key.objectid < end && key.objectid + key.offset <= end); + + caching_ctl->progress = key.objectid; + + offset = key.objectid; + while (offset < key.objectid + key.offset) { + bit = free_space_test_bit(block_group, path, offset); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + total_found += add_new_free_space(block_group, + fs_info, + extent_start, + offset); + if (total_found > CACHING_CTL_WAKE_UP) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + extent_count++; + } + prev_bit = bit; + offset += block_group->sectorsize; + } + } + if (prev_bit == 1) { + total_found += add_new_free_space(block_group, fs_info, + extent_start, end); + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + caching_ctl->progress = (u64)-1; + + ret = 0; +out: + return ret; +} + +static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, + struct btrfs_path *path, + u32 expected_extent_count) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + u64 end; + u64 total_found = 0; + u32 extent_count = 0; + int ret; + + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + root = fs_info->free_space_root; + + end = block_group->key.objectid + block_group->key.offset; + + while (1) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_FREE_SPACE_INFO_KEY) + break; + + ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); + ASSERT(key.objectid < end && key.objectid + key.offset <= end); + + caching_ctl->progress = key.objectid; + + total_found += add_new_free_space(block_group, fs_info, + key.objectid, + key.objectid + key.offset); + if (total_found > CACHING_CTL_WAKE_UP) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + caching_ctl->progress = (u64)-1; + + ret = 0; +out: + return ret; +} + +int load_free_space_tree(struct btrfs_caching_control *caching_ctl) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_free_space_info *info; + struct btrfs_path *path; + u32 extent_count, flags; + int ret; + + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Just like caching_thread() doesn't want to deadlock on the extent + * tree, we don't want to deadlock on the free space tree. + */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = 1; + + info = search_free_space_info(NULL, fs_info, block_group, path, 0); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + flags = btrfs_free_space_flags(path->nodes[0], info); + + /* + * We left path pointing to the free space info item, so now + * load_free_space_foo can just iterate through the free space tree from + * there. + */ + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) + ret = load_free_space_bitmaps(caching_ctl, path, extent_count); + else + ret = load_free_space_extents(caching_ctl, path, extent_count); + +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h new file mode 100644 index 000000000000..54ffced3bce8 --- /dev/null +++ b/fs/btrfs/free-space-tree.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2015 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_FREE_SPACE_TREE +#define __BTRFS_FREE_SPACE_TREE + +/* + * The default size for new free space bitmap items. The last bitmap in a block + * group may be truncated, and none of the free space tree code assumes that + * existing bitmaps are this size. + */ +#define BTRFS_FREE_SPACE_BITMAP_SIZE 256 +#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE) + +void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group); +int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); +int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info); +int load_free_space_tree(struct btrfs_caching_control *caching_ctl); +int add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size); +int remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size); + +/* Exposed for testing. */ +struct btrfs_free_space_info * +search_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, int cow); +int __add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size); +int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size); +int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); +int convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); +int free_space_test_bit(struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 offset); + +#endif diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 767a6056ac45..8b57c17b3fb3 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -48,7 +48,7 @@ static int caching_kthread(void *data) /* Since the commit root is read-only, we can safely skip locking. */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 2; + path->reada = READA_FORWARD; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.offset = 0; @@ -282,7 +282,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) } } -#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space)) +#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space)) #define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8) /* @@ -334,7 +334,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, return true; } -static struct btrfs_free_space_op free_ino_op = { +static const struct btrfs_free_space_op free_ino_op = { .recalc_thresholds = recalculate_thresholds, .use_bitmap = use_bitmap, }; @@ -356,7 +356,7 @@ static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, return false; } -static struct btrfs_free_space_op pinned_free_ino_op = { +static const struct btrfs_free_space_op pinned_free_ino_op = { .recalc_thresholds = pinned_recalc_thresholds, .use_bitmap = pinned_use_bitmap, }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b8856e182ae..247830107686 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -66,6 +66,13 @@ struct btrfs_iget_args { struct btrfs_root *root; }; +struct btrfs_dio_data { + u64 outstanding_extents; + u64 reserve; + u64 unsubmitted_oe_range_start; + u64 unsubmitted_oe_range_end; +}; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_dir_ro_inode_operations; @@ -74,17 +81,16 @@ static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct address_space_operations btrfs_symlink_aops; static const struct file_operations btrfs_dir_file_operations; -static struct extent_io_ops btrfs_extent_io_ops; +static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; -static struct kmem_cache *btrfs_delalloc_work_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; #define S_SHIFT 12 -static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { +static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, @@ -414,15 +420,15 @@ static noinline void compress_file_range(struct inode *inode, unsigned long nr_pages_ret = 0; unsigned long total_compressed = 0; unsigned long total_in = 0; - unsigned long max_compressed = 128 * 1024; - unsigned long max_uncompressed = 128 * 1024; + unsigned long max_compressed = SZ_128K; + unsigned long max_uncompressed = SZ_128K; int i; int will_compress; int compress_type = root->fs_info->compress_type; int redirty = 0; /* if this is a small write inside eof, kick off a defrag */ - if ((end - start + 1) < 16 * 1024 && + if ((end - start + 1) < SZ_16K && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); @@ -430,7 +436,7 @@ static noinline void compress_file_range(struct inode *inode, again: will_compress = 0; nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; - nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE); /* * we don't want to send crud past the end of i_size through @@ -944,7 +950,7 @@ static noinline int cow_file_range(struct inode *inode, disk_num_bytes = num_bytes; /* if this is a small write inside eof, kick off defrag */ - if (num_bytes < 64 * 1024 && + if (num_bytes < SZ_64K && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); @@ -1107,7 +1113,7 @@ static noinline void async_cow_submit(struct btrfs_work *work) * atomic_sub_return implies a barrier for waitqueue_active */ if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < - 5 * 1024 * 1024 && + 5 * SZ_1M && waitqueue_active(&root->fs_info->async_submit_wait)) wake_up(&root->fs_info->async_submit_wait); @@ -1132,7 +1138,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, struct btrfs_root *root = BTRFS_I(inode)->root; unsigned long nr_pages; u64 cur_end; - int limit = 10 * 1024 * 1024; + int limit = 10 * SZ_1M; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); @@ -1148,7 +1154,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, !btrfs_test_opt(root, FORCE_COMPRESS)) cur_end = end; else - cur_end = min(end, start + 512 * 1024 - 1); + cur_end = min(end, start + SZ_512K - 1); async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); @@ -1989,7 +1995,7 @@ again: page_start = page_offset(page); page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ @@ -2482,7 +2488,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path, lock_start = backref->file_pos; lock_end = backref->file_pos + backref->num_bytes - 1; lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - 0, &cached); + &cached); ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); if (ordered) { @@ -2874,7 +2880,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - 0, &cached_state); + &cached_state); ret = test_range_bit(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, @@ -3106,55 +3112,47 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, start, (size_t)(end - start + 1)); } -struct delayed_iput { - struct list_head list; - struct inode *inode; -}; - -/* JDM: If this is fs-wide, why can't we add a pointer to - * btrfs_inode instead and avoid the allocation? */ void btrfs_add_delayed_iput(struct inode *inode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - struct delayed_iput *delayed; + struct btrfs_inode *binode = BTRFS_I(inode); if (atomic_add_unless(&inode->i_count, -1, 1)) return; - delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); - delayed->inode = inode; - spin_lock(&fs_info->delayed_iput_lock); - list_add_tail(&delayed->list, &fs_info->delayed_iputs); + if (binode->delayed_iput_count == 0) { + ASSERT(list_empty(&binode->delayed_iput)); + list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); + } else { + binode->delayed_iput_count++; + } spin_unlock(&fs_info->delayed_iput_lock); } void btrfs_run_delayed_iputs(struct btrfs_root *root) { - LIST_HEAD(list); struct btrfs_fs_info *fs_info = root->fs_info; - struct delayed_iput *delayed; - int empty; - - spin_lock(&fs_info->delayed_iput_lock); - empty = list_empty(&fs_info->delayed_iputs); - spin_unlock(&fs_info->delayed_iput_lock); - if (empty) - return; down_read(&fs_info->delayed_iput_sem); - spin_lock(&fs_info->delayed_iput_lock); - list_splice_init(&fs_info->delayed_iputs, &list); - spin_unlock(&fs_info->delayed_iput_lock); - - while (!list_empty(&list)) { - delayed = list_entry(list.next, struct delayed_iput, list); - list_del(&delayed->list); - iput(delayed->inode); - kfree(delayed); + while (!list_empty(&fs_info->delayed_iputs)) { + struct btrfs_inode *inode; + + inode = list_first_entry(&fs_info->delayed_iputs, + struct btrfs_inode, delayed_iput); + if (inode->delayed_iput_count) { + inode->delayed_iput_count--; + list_move_tail(&inode->delayed_iput, + &fs_info->delayed_iputs); + } else { + list_del_init(&inode->delayed_iput); + } + spin_unlock(&fs_info->delayed_iput_lock); + iput(&inode->vfs_inode); + spin_lock(&fs_info->delayed_iput_lock); } - + spin_unlock(&fs_info->delayed_iput_lock); up_read(&root->fs_info->delayed_iput_sem); } @@ -3351,7 +3349,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = -ENOMEM; goto out; } - path->reada = -1; + path->reada = READA_BACK; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; @@ -4318,7 +4316,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = -1; + path->reada = READA_BACK; /* * We want to drop from the next block forward in case this new size is @@ -4349,7 +4347,7 @@ search_again: * up a huge file in a single leaf. Most of the time that * bytes_deleted is > 0, it will be huge by the time we get here */ - if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (be_nice && bytes_deleted > SZ_32M) { if (btrfs_should_end_transaction(trans, root)) { err = -EAGAIN; goto error; @@ -4592,7 +4590,7 @@ error: btrfs_free_path(path); - if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (be_nice && bytes_deleted > SZ_32M) { unsigned long updates = trans->delayed_ref_updates; if (updates) { trans->delayed_ref_updates = 0; @@ -4669,7 +4667,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(io_tree, page_start, page_end, &cached_state); set_page_extent_mapped(page); ordered = btrfs_lookup_ordered_extent(inode, page_start); @@ -4800,7 +4798,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) while (1) { struct btrfs_ordered_extent *ordered; - lock_extent_bits(io_tree, hole_start, block_end - 1, 0, + lock_extent_bits(io_tree, hole_start, block_end - 1, &cached_state); ordered = btrfs_lookup_ordered_range(inode, hole_start, block_end - hole_start); @@ -5112,7 +5110,7 @@ static void evict_inode_truncate_pages(struct inode *inode) end = state->end; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, start, end, 0, &cached_state); + lock_extent_bits(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, @@ -5305,7 +5303,6 @@ void btrfs_evict_inode(struct inode *inode) no_delete: btrfs_remove_delayed_node(inode); clear_inode(inode); - return; } /* @@ -5754,7 +5751,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; if (key_type == BTRFS_DIR_INDEX_KEY) { INIT_LIST_HEAD(&ins_list); @@ -6482,7 +6479,7 @@ out_unlock_inode: static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); u64 index; @@ -6508,6 +6505,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) { err = PTR_ERR(trans); + trans = NULL; goto fail; } @@ -6541,9 +6539,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_log_new_name(trans, inode, NULL, parent); } - btrfs_end_transaction(trans, root); btrfs_balance_delayed_items(root); fail: + if (trans) + btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -6688,7 +6687,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree, } static noinline int uncompress_inline(struct btrfs_path *path, - struct inode *inode, struct page *page, + struct page *page, size_t pg_offset, u64 extent_offset, struct btrfs_file_extent_item *item) { @@ -6785,7 +6784,7 @@ again: * Chances are we'll be called again, so go ahead and do * readahead */ - path->reada = 1; + path->reada = READA_FORWARD; } ret = btrfs_lookup_file_extent(trans, root, path, @@ -6884,8 +6883,7 @@ next: if (create == 0 && !PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { - ret = uncompress_inline(path, inode, page, - pg_offset, + ret = uncompress_inline(path, page, pg_offset, extent_offset, item); if (ret) { err = ret; @@ -7381,7 +7379,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, while (1) { lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, cached_state); + cached_state); /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure theres no ordered @@ -7409,25 +7407,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } else { - /* Screw you mmap */ - ret = btrfs_fdatawrite_range(inode, lockstart, lockend); - if (ret) - break; - ret = filemap_fdatawait_range(inode->i_mapping, - lockstart, - lockend); - if (ret) - break; - /* - * If we found a page that couldn't be invalidated just - * fall back to buffered. + * We could trigger writeback for this range (and wait + * for it to complete) and then invalidate the pages for + * this range (through invalidate_inode_pages2_range()), + * but that can lead us to a deadlock with a concurrent + * call to readpages() (a buffered read or a defrag call + * triggered a readahead) on a page lock due to an + * ordered dio extent we created before but did not have + * yet a corresponding bio submitted (whence it can not + * complete), which makes readpages() wait for that + * ordered extent to complete while holding a lock on + * that page. */ - ret = invalidate_inode_pages2_range(inode->i_mapping, - lockstart >> PAGE_CACHE_SHIFT, - lockend >> PAGE_CACHE_SHIFT); - if (ret) - break; + ret = -ENOTBLK; + break; } cond_resched(); @@ -7483,11 +7477,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, return em; } -struct btrfs_dio_data { - u64 outstanding_extents; - u64 reserve; -}; - static void adjust_dio_outstanding_extents(struct inode *inode, struct btrfs_dio_data *dio_data, const u64 len) @@ -7671,6 +7660,7 @@ unlock: btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; + dio_data->unsubmitted_oe_range_end = start + len; current->journal_info = dio_data; } @@ -7993,22 +7983,22 @@ static void btrfs_endio_direct_read(struct bio *bio) bio_put(bio); } -static void btrfs_endio_direct_write(struct bio *bio) +static void btrfs_endio_direct_write_update_ordered(struct inode *inode, + const u64 offset, + const u64 bytes, + const int uptodate) { - struct btrfs_dio_private *dip = bio->bi_private; - struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered = NULL; - u64 ordered_offset = dip->logical_offset; - u64 ordered_bytes = dip->bytes; - struct bio *dio_bio; + u64 ordered_offset = offset; + u64 ordered_bytes = bytes; int ret; again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, ordered_bytes, - !bio->bi_error); + uptodate); if (!ret) goto out_test; @@ -8021,13 +8011,22 @@ out_test: * our bio might span multiple ordered extents. If we haven't * completed the accounting for the whole dio, go back and try again */ - if (ordered_offset < dip->logical_offset + dip->bytes) { - ordered_bytes = dip->logical_offset + dip->bytes - - ordered_offset; + if (ordered_offset < offset + bytes) { + ordered_bytes = offset + bytes - ordered_offset; ordered = NULL; goto again; } - dio_bio = dip->dio_bio; +} + +static void btrfs_endio_direct_write(struct bio *bio) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct bio *dio_bio = dip->dio_bio; + + btrfs_endio_direct_write_update_ordered(dip->inode, + dip->logical_offset, + dip->bytes, + !bio->bi_error); kfree(dip); @@ -8335,6 +8334,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->subio_endio = btrfs_subio_endio_read; } + /* + * Reset the range for unsubmitted ordered extents (to a 0 length range) + * even if we fail to submit a bio, because in such case we do the + * corresponding error handling below and it must not be done a second + * time by btrfs_direct_IO(). + */ + if (write) { + struct btrfs_dio_data *dio_data = current->journal_info; + + dio_data->unsubmitted_oe_range_end = dip->logical_offset + + dip->bytes; + dio_data->unsubmitted_oe_range_start = + dio_data->unsubmitted_oe_range_end; + } + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); if (!ret) return; @@ -8363,24 +8377,15 @@ free_ordered: dip = NULL; io_bio = NULL; } else { - if (write) { - struct btrfs_ordered_extent *ordered; - - ordered = btrfs_lookup_ordered_extent(inode, - file_offset); - set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); - /* - * Decrements our ref on the ordered extent and removes - * the ordered extent from the inode's ordered tree, - * doing all the proper resource cleanup such as for the - * reserved space and waking up any waiters for this - * ordered extent (through btrfs_remove_ordered_extent). - */ - btrfs_finish_ordered_io(ordered); - } else { + if (write) + btrfs_endio_direct_write_update_ordered(inode, + file_offset, + dio_bio->bi_iter.bi_size, + 0); + else unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); - } + dio_bio->bi_error = -EIO; /* * Releases and cleans up our dio_bio, no need to bio_put() @@ -8480,6 +8485,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * originally calculated. Abuse current->journal_info for this. */ dio_data.reserve = round_up(count, root->sectorsize); + dio_data.unsubmitted_oe_range_start = (u64)offset; + dio_data.unsubmitted_oe_range_end = (u64)offset; current->journal_info = &dio_data; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { @@ -8498,6 +8505,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (dio_data.reserve) btrfs_delalloc_release_space(inode, offset, dio_data.reserve); + /* + * On error we might have left some ordered extents + * without submitting corresponding bios for them, so + * cleanup them up to avoid other tasks getting them + * and waiting for them to complete forever. + */ + if (dio_data.unsubmitted_oe_range_start < + dio_data.unsubmitted_oe_range_end) + btrfs_endio_direct_write_update_ordered(inode, + dio_data.unsubmitted_oe_range_start, + dio_data.unsubmitted_oe_range_end - + dio_data.unsubmitted_oe_range_start, + 0); } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, offset, count - (size_t)ret); @@ -8535,15 +8555,28 @@ int btrfs_readpage(struct file *file, struct page *page) static int btrfs_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; - + struct inode *inode = page->mapping->host; + int ret; if (current->flags & PF_MEMALLOC) { redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; } + + /* + * If we are under memory pressure we will call this directly from the + * VM, we need to make sure we have the inode referenced for the ordered + * extent. If not just return like we didn't do anything. + */ + if (!igrab(inode)) { + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; + } tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_write_full_page(tree, page, btrfs_get_extent, wbc); + ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc); + btrfs_add_delayed_iput(inode); + return ret; } static int btrfs_writepages(struct address_space *mapping, @@ -8615,7 +8648,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, } if (!inode_evicting) - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(tree, page_start, page_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { /* @@ -8653,7 +8686,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_put_ordered_extent(ordered); if (!inode_evicting) { cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, + lock_extent_bits(tree, page_start, page_end, &cached_state); } } @@ -8751,7 +8784,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(io_tree, page_start, page_end, &cached_state); set_page_extent_mapped(page); /* @@ -9025,6 +9058,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_log_commit = 0; + ei->delayed_iput_count = 0; spin_lock_init(&ei->lock); ei->outstanding_extents = 0; @@ -9049,6 +9083,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); + INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); return inode; @@ -9153,15 +9188,14 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_path_cachep); if (btrfs_free_space_cachep) kmem_cache_destroy(btrfs_free_space_cachep); - if (btrfs_delalloc_work_cachep) - kmem_cache_destroy(btrfs_delalloc_work_cachep); } int btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + init_once); if (!btrfs_inode_cachep) goto fail; @@ -9189,13 +9223,6 @@ int btrfs_init_cachep(void) if (!btrfs_free_space_cachep) goto fail; - btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", - sizeof(struct btrfs_delalloc_work), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - NULL); - if (!btrfs_delalloc_work_cachep) - goto fail; - return 0; fail: btrfs_destroy_cachep(); @@ -9419,14 +9446,10 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work) delalloc_work = container_of(work, struct btrfs_delalloc_work, work); inode = delalloc_work->inode; - if (delalloc_work->wait) { - btrfs_wait_ordered_range(inode, 0, (u64)-1); - } else { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); - } if (delalloc_work->delay_iput) btrfs_add_delayed_iput(inode); @@ -9436,18 +9459,17 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work) } struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, - int wait, int delay_iput) + int delay_iput) { struct btrfs_delalloc_work *work; - work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); + work = kmalloc(sizeof(*work), GFP_NOFS); if (!work) return NULL; init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - work->wait = wait; work->delay_iput = delay_iput; WARN_ON_ONCE(!inode); btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, @@ -9459,7 +9481,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) { wait_for_completion(&work->completion); - kmem_cache_free(btrfs_delalloc_work_cachep, work); + kfree(work); } /* @@ -9495,7 +9517,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, } spin_unlock(&root->delalloc_lock); - work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + work = btrfs_alloc_delalloc_work(inode, delay_iput); if (!work) { if (delay_iput) btrfs_add_delayed_iput(inode); @@ -9637,9 +9659,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, /* * 2 items for inode item and ref * 2 items for dir items + * 1 item for updating parent inode item + * 1 item for the inline extent item * 1 item for xattr if selinux is on */ - trans = btrfs_start_transaction(root, 5); + trans = btrfs_start_transaction(root, 7); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -9670,10 +9694,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock_inode; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); - if (err) - goto out_unlock_inode; - path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; @@ -9711,6 +9731,13 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); + /* + * Last step, add directory indexes for our symlink inode. This is the + * last step to avoid extra cleanup of these indexes if an error happens + * elsewhere above. + */ + if (!err) + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) { drop_inode = 1; goto out_unlock_inode; @@ -9761,7 +9788,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } } - cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); + cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* * If we are severely fragmented we could end up with really @@ -10025,7 +10052,7 @@ static const struct file_operations btrfs_dir_file_operations = { .fsync = btrfs_sync_file, }; -static struct extent_io_ops btrfs_extent_io_ops = { +static const struct extent_io_ops btrfs_extent_io_ops = { .fill_delalloc = run_delalloc_range, .submit_bio_hook = btrfs_submit_bio_hook, .merge_bio_hook = btrfs_merge_bio_hook, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e21997385d14..2a47a3148ec8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -655,22 +655,28 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) + return -ENOMEM; + + pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), + GFP_NOFS); + pending_snapshot->path = btrfs_alloc_path(); + if (!pending_snapshot->root_item || !pending_snapshot->path) { + ret = -ENOMEM; + goto free_pending; + } + atomic_inc(&root->will_be_snapshoted); smp_mb__after_atomic(); btrfs_wait_for_no_snapshoting_writes(root); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) - goto out; + goto dec_and_free; btrfs_wait_ordered_extents(root, -1); - pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) { - ret = -ENOMEM; - goto out; - } - btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -686,7 +692,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, &pending_snapshot->qgroup_reserved, false); if (ret) - goto free; + goto dec_and_free; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -737,11 +743,14 @@ fail: btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, &pending_snapshot->block_rsv, pending_snapshot->qgroup_reserved); -free: - kfree(pending_snapshot); -out: +dec_and_free: if (atomic_dec_and_test(&root->will_be_snapshoted)) wake_up_atomic_t(&root->will_be_snapshoted); +free_pending: + kfree(pending_snapshot->root_item); + btrfs_free_path(pending_snapshot->path); + kfree(pending_snapshot); + return ret; } @@ -992,7 +1001,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) u64 end = start + len - 1; /* get the big lock and read metadata off disk */ - lock_extent_bits(io_tree, start, end, 0, &cached); + lock_extent_bits(io_tree, start, end, &cached); em = btrfs_get_extent(inode, NULL, 0, start, len, 0); unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); @@ -1016,7 +1025,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) ret = false; else if ((em->block_start + em->block_len == next->block_start) && - (em->block_len > 128 * 1024 && next->block_len > 128 * 1024)) + (em->block_len > SZ_128K && next->block_len > SZ_128K)) ret = false; free_extent_map(next); @@ -1140,7 +1149,7 @@ again: page_end = page_start + PAGE_CACHE_SIZE - 1; while (1) { lock_extent_bits(tree, page_start, page_end, - 0, &cached_state); + &cached_state); ordered = btrfs_lookup_ordered_extent(inode, page_start); unlock_extent_cached(tree, page_start, page_end, @@ -1200,7 +1209,7 @@ again: page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; lock_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, 0, &cached_state); + page_start, page_end - 1, &cached_state); clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, @@ -1262,9 +1271,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; u32 extent_thresh = range->extent_thresh; - unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT; unsigned long cluster = max_cluster; - u64 new_align = ~((u64)128 * 1024 - 1); + u64 new_align = ~((u64)SZ_128K - 1); struct page **pages = NULL; if (isize == 0) @@ -1281,7 +1290,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } if (extent_thresh == 0) - extent_thresh = 256 * 1024; + extent_thresh = SZ_256K; /* * if we were not given a file, allocate a readahead @@ -1313,7 +1322,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (newer_than) { ret = find_new_extents(root, inode, newer_than, - &newer_off, 64 * 1024); + &newer_off, SZ_64K); if (!ret) { range->start = newer_off; /* @@ -1403,9 +1412,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, newer_off = max(newer_off + 1, (u64)i << PAGE_CACHE_SHIFT); - ret = find_new_extents(root, inode, - newer_than, &newer_off, - 64 * 1024); + ret = find_new_extents(root, inode, newer_than, + &newer_off, SZ_64K); if (!ret) { range->start = newer_off; i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; @@ -1571,7 +1579,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = old_size + new_size; } - if (new_size < 256 * 1024 * 1024) { + if (new_size < SZ_256M) { ret = -EINVAL; goto out_free; } @@ -2160,7 +2168,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, struct inode *inode; int ret; size_t buf_size; - const size_t buf_limit = 16 * 1024 * 1024; + const size_t buf_limit = SZ_16M; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3096,7 +3104,7 @@ out_unlock: return ret; } -#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) +#define BTRFS_MAX_DEDUPE_LEN SZ_16M ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, struct file *dst_file, u64 dst_loff) @@ -3396,7 +3404,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, return ret; } - path->reada = 2; + path->reada = READA_FORWARD; /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; @@ -4039,7 +4047,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) return -ENOMEM; space_args.total_spaces = 0; - dest = kmalloc(alloc_size, GFP_NOFS); + dest = kmalloc(alloc_size, GFP_KERNEL); if (!dest) return -ENOMEM; dest_orig = dest; @@ -4416,7 +4424,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, goto out; } - size = min_t(u32, loi->size, 64 * 1024); + size = min_t(u32, loi->size, SZ_64K); inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); @@ -4565,7 +4573,7 @@ locked: goto out_bargs; } - bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); if (!bctl) { ret = -ENOMEM; goto out_bargs; @@ -4651,7 +4659,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_root *root, goto out; } - bargs = kzalloc(sizeof(*bargs), GFP_NOFS); + bargs = kzalloc(sizeof(*bargs), GFP_KERNEL); if (!bargs) { ret = -ENOMEM; goto out; @@ -4911,7 +4919,7 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - qsa = kzalloc(sizeof(*qsa), GFP_NOFS); + qsa = kzalloc(sizeof(*qsa), GFP_KERNEL); if (!qsa) return -ENOMEM; @@ -5041,7 +5049,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file, goto out; } - args64 = kmalloc(sizeof(*args64), GFP_NOFS); + args64 = kmalloc(sizeof(*args64), GFP_KERNEL); if (!args64) { ret = -ENOMEM; goto out; @@ -5178,7 +5186,7 @@ out_unlock: static int btrfs_ioctl_get_supported_features(struct file *file, void __user *arg) { - static struct btrfs_ioctl_feature_flags features[3] = { + static const struct btrfs_ioctl_feature_flags features[3] = { INIT_FEATURE_FLAGS(SUPP), INIT_FEATURE_FLAGS(SAFE_SET), INIT_FEATURE_FLAGS(SAFE_CLEAR) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 8077461fc56a..d13128c70ddd 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -56,7 +56,6 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) atomic_dec(&eb->spinning_readers); read_unlock(&eb->lock); } - return; } /* @@ -96,7 +95,6 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); } - return; } /* diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 1a33d3eb36de..6d707545f775 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -503,7 +503,6 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) } spin_unlock_irqrestore(&table->cache_lock, flags); - return; } /* @@ -906,7 +905,6 @@ static void raid_write_end_io(struct bio *bio) err = -EIO; rbio_orig_end_io(rbio, err); - return; } /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b4ca5454ef1a..ef6d8fc85853 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -708,8 +708,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, err = -ENOMEM; goto out; } - path1->reada = 1; - path2->reada = 2; + path1->reada = READA_FORWARD; + path2->reada = READA_FORWARD; node = alloc_backref_node(cache); if (!node) { @@ -2130,7 +2130,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -3527,7 +3527,7 @@ static int find_data_references(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; root = read_fs_root(rc->extent_root->fs_info, ref_root); if (IS_ERR(root)) { @@ -3917,7 +3917,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; ret = prepare_to_relocate(rc); if (ret) { @@ -4343,7 +4343,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = -1; + path->reada = READA_BACK; key.objectid = BTRFS_TREE_RELOC_OBJECTID; key.type = BTRFS_ROOT_ITEM_KEY; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b091d94ceef6..0c981ebe2acb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1514,8 +1514,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, if (sblock->no_io_error_seen) scrub_recheck_block_checksum(sblock); - - return; } static inline int scrub_check_fsid(u8 fsid[], @@ -3507,7 +3505,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; path->skip_locking = 1; @@ -3735,27 +3733,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) fs_info->scrub_workers = - btrfs_alloc_workqueue("btrfs-scrub", flags, + btrfs_alloc_workqueue("scrub", flags, 1, 4); else fs_info->scrub_workers = - btrfs_alloc_workqueue("btrfs-scrub", flags, + btrfs_alloc_workqueue("scrub", flags, max_active, 4); if (!fs_info->scrub_workers) goto fail_scrub_workers; fs_info->scrub_wr_completion_workers = - btrfs_alloc_workqueue("btrfs-scrubwrc", flags, + btrfs_alloc_workqueue("scrubwrc", flags, max_active, 2); if (!fs_info->scrub_wr_completion_workers) goto fail_scrub_wr_completion_workers; fs_info->scrub_nocow_workers = - btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); + btrfs_alloc_workqueue("scrubnc", flags, 1, 0); if (!fs_info->scrub_nocow_workers) goto fail_scrub_nocow_workers; fs_info->scrub_parity_workers = - btrfs_alloc_workqueue("btrfs-scrubparity", flags, + btrfs_alloc_workqueue("scrubparity", flags, max_active, 2); if (!fs_info->scrub_parity_workers) goto fail_scrub_parity_workers; @@ -4211,7 +4209,7 @@ static int check_extent_to_block(struct inode *inode, u64 start, u64 len, io_tree = &BTRFS_I(inode)->io_tree; - lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); + lock_extent_bits(io_tree, lockstart, lockend, &cached_state); ordered = btrfs_lookup_ordered_range(inode, lockstart, len); if (ordered) { btrfs_put_ordered_extent(ordered); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 355a458cba1a..63a6152be04b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1469,7 +1469,21 @@ static int read_symlink(struct btrfs_root *root, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret); + if (ret) { + /* + * An empty symlink inode. Can happen in rare error paths when + * creating a symlink (transaction committed before the inode + * eviction handler removed the symlink inode items and a crash + * happened in between or the subvol was snapshoted in between). + * Print an informative message to dmesg/syslog so that the user + * can delete the symlink. + */ + btrfs_err(root->fs_info, + "Found empty symlink inode %llu at root %llu", + ino, root->root_key.objectid); + ret = -EIO; + goto out; + } ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 48d425aef05b..02e00166c4da 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -22,8 +22,8 @@ #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" #define BTRFS_SEND_STREAM_VERSION 1 -#define BTRFS_SEND_BUF_SIZE (1024 * 64) -#define BTRFS_SEND_READ_SIZE (1024 * 48) +#define BTRFS_SEND_BUF_SIZE SZ_64K +#define BTRFS_SEND_READ_SIZE (48 * SZ_1K) enum btrfs_tlv_type { BTRFS_TLV_U8, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a0434c179ea9..9b9eab6d048e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -295,10 +295,11 @@ enum { Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, - Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, - Opt_no_space_cache, Opt_recovery, Opt_skip_balance, - Opt_check_integrity, Opt_check_integrity_including_extent_data, + Opt_space_cache, Opt_space_cache_version, Opt_clear_cache, + Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, + Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, + Opt_skip_balance, Opt_check_integrity, + Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, @@ -309,7 +310,7 @@ enum { Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_degraded, "degraded"}, {Opt_subvol, "subvol=%s"}, {Opt_subvolid, "subvolid=%s"}, @@ -340,6 +341,7 @@ static match_table_t tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_space_cache, "space_cache"}, + {Opt_space_cache_version, "space_cache=%s"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, {Opt_enospc_debug, "enospc_debug"}, @@ -383,7 +385,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) bool compress_force = false; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (cache_gen) + if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE)) + btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); + else if (cache_gen) btrfs_set_opt(info->mount_opt, SPACE_CACHE); if (!options) @@ -617,15 +621,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) "turning off discard"); break; case Opt_space_cache: - btrfs_set_and_info(root, SPACE_CACHE, - "enabling disk space caching"); + case Opt_space_cache_version: + if (token == Opt_space_cache || + strcmp(args[0].from, "v1") == 0) { + btrfs_clear_opt(root->fs_info->mount_opt, + FREE_SPACE_TREE); + btrfs_set_and_info(root, SPACE_CACHE, + "enabling disk space caching"); + } else if (strcmp(args[0].from, "v2") == 0) { + btrfs_clear_opt(root->fs_info->mount_opt, + SPACE_CACHE); + btrfs_set_and_info(root, FREE_SPACE_TREE, + "enabling free space tree"); + } else { + ret = -EINVAL; + goto out; + } break; case Opt_rescan_uuid_tree: btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - btrfs_clear_and_info(root, SPACE_CACHE, - "disabling disk space caching"); + if (btrfs_test_opt(root, SPACE_CACHE)) { + btrfs_clear_and_info(root, SPACE_CACHE, + "disabling disk space caching"); + } + if (btrfs_test_opt(root, FREE_SPACE_TREE)) { + btrfs_clear_and_info(root, FREE_SPACE_TREE, + "disabling free space tree"); + } break; case Opt_inode_cache: btrfs_set_pending_and_info(info, INODE_MAP_CACHE, @@ -754,8 +778,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } } out: + if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && + !btrfs_test_opt(root, FREE_SPACE_TREE) && + !btrfs_test_opt(root, CLEAR_CACHE)) { + btrfs_err(root->fs_info, "cannot disable free space tree"); + ret = -EINVAL; + + } if (!ret && btrfs_test_opt(root, SPACE_CACHE)) btrfs_info(root->fs_info, "disk space caching is enabled"); + if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE)) + btrfs_info(root->fs_info, "using free space tree"); kfree(orig); return ret; } @@ -1162,6 +1195,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",noacl"); if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); + else if (btrfs_test_opt(root, FREE_SPACE_TREE)) + seq_puts(seq, ",space_cache=v2"); else seq_puts(seq, ",nospace_cache"); if (btrfs_test_opt(root, RESCAN_UUID_TREE)) @@ -1863,7 +1898,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) * btrfs starts at an offset of at least 1MB when doing chunk * allocation. */ - skip_space = 1024 * 1024; + skip_space = SZ_1M; /* user can set the offset in fs_info->alloc_start. */ if (fs_info->alloc_start && @@ -1954,6 +1989,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) * there are other factors that may change the result (like a new metadata * chunk). * + * If metadata is exhausted, f_bavail will be 0. + * * FIXME: not accurate for mixed block groups, total and free/used are ok, * available appears slightly larger. */ @@ -1965,11 +2002,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; + u64 total_free_meta = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)fs_info->fsid; unsigned factor = 1; struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; + u64 thresh = 0; /* * holding chunk_muext to avoid allocating new chunks, holding @@ -1995,6 +2034,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } } } + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + total_free_meta += found->disk_total - found->disk_used; total_used += found->disk_used; } @@ -2017,6 +2058,24 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail += div_u64(total_free_data, factor); buf->f_bavail = buf->f_bavail >> bits; + /* + * We calculate the remaining metadata space minus global reserve. If + * this is (supposedly) smaller than zero, there's no space. But this + * does not hold in practice, the exhausted state happens where's still + * some positive delta. So we apply some guesswork and compare the + * delta to a 4M threshold. (Practically observed delta was ~2M.) + * + * We probably cannot calculate the exact threshold value because this + * depends on the internal reservations requested by various + * operations, so some operations that consume a few metadata will + * succeed even if the Avail is zero. But this is better than the other + * way around. + */ + thresh = 4 * 1024 * 1024; + + if (total_free_meta - thresh < block_rsv->size) + buf->f_bavail = 0; + buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_namelen = BTRFS_NAME_LEN; @@ -2223,6 +2282,9 @@ static int btrfs_run_sanity_tests(void) if (ret) goto out; ret = btrfs_test_qgroups(); + if (ret) + goto out; + ret = btrfs_test_free_space_tree(); out: btrfs_destroy_test_fs(); return ret; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 9626252ee6b4..b1d920b30070 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -21,6 +21,9 @@ #include <linux/magic.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../free-space-cache.h" +#include "../free-space-tree.h" +#include "../transaction.h" #include "../volumes.h" #include "../disk-io.h" #include "../qgroup.h" @@ -122,6 +125,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + extent_io_tree_init(&fs_info->freed_extents[0], NULL); + extent_io_tree_init(&fs_info->freed_extents[1], NULL); + fs_info->pinned_extents = &fs_info->freed_extents[0]; return fs_info; } @@ -169,3 +175,55 @@ void btrfs_free_dummy_root(struct btrfs_root *root) kfree(root); } +struct btrfs_block_group_cache * +btrfs_alloc_dummy_block_group(unsigned long length) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + cache->fs_info = btrfs_alloc_dummy_fs_info(); + if (!cache->fs_info) { + kfree(cache->free_space_ctl); + kfree(cache); + return NULL; + } + + cache->key.objectid = 0; + cache->key.offset = length; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = 4096; + cache->full_stripe_len = 4096; + + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->bg_list); + btrfs_init_free_space_ctl(cache); + mutex_init(&cache->free_space_lock); + + return cache; +} + +void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache) +{ + if (!cache) + return; + __btrfs_remove_free_space_cache(cache->free_space_ctl); + kfree(cache->free_space_ctl); + kfree(cache); +} + +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) +{ + memset(trans, 0, sizeof(*trans)); + trans->transid = 1; + INIT_LIST_HEAD(&trans->qgroup_ref_list); + trans->type = __TRANS_DUMMY; +} diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index fd3954224480..054b8c73c951 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -24,17 +24,23 @@ #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) struct btrfs_root; +struct btrfs_trans_handle; int btrfs_test_free_space_cache(void); int btrfs_test_extent_buffer_operations(void); int btrfs_test_extent_io(void); int btrfs_test_inodes(void); int btrfs_test_qgroups(void); +int btrfs_test_free_space_tree(void); int btrfs_init_test_fs(void); void btrfs_destroy_test_fs(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); void btrfs_free_dummy_root(struct btrfs_root *root); +struct btrfs_block_group_cache * +btrfs_alloc_dummy_block_group(unsigned long length); +void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); #else static inline int btrfs_test_free_space_cache(void) { @@ -63,6 +69,10 @@ static inline int btrfs_test_qgroups(void) { return 0; } +static inline int btrfs_test_free_space_tree(void) +{ + return 0; +} #endif #endif diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 9e9f2368177d..e29fa297e053 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -18,6 +18,8 @@ #include <linux/pagemap.h> #include <linux/sched.h> +#include <linux/slab.h> +#include <linux/sizes.h> #include "btrfs-tests.h" #include "../extent_io.h" @@ -70,12 +72,14 @@ static int test_find_delalloc(void) struct page *page; struct page *locked_page = NULL; unsigned long index = 0; - u64 total_dirty = 256 * 1024 * 1024; - u64 max_bytes = 128 * 1024 * 1024; + u64 total_dirty = SZ_256M; + u64 max_bytes = SZ_128M; u64 start, end, test_start; u64 found; int ret = -EINVAL; + test_msg("Running find delalloc tests\n"); + inode = btrfs_new_test_inode(); if (!inode) { test_msg("Failed to allocate test inode\n"); @@ -133,7 +137,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - test_start = 64 * 1024 * 1024; + test_start = SZ_64M; locked_page = find_lock_page(inode->i_mapping, test_start >> PAGE_CACHE_SHIFT); if (!locked_page) { @@ -220,8 +224,8 @@ static int test_find_delalloc(void) * Now to test where we run into a page that is no longer dirty in the * range we want to find. */ - page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024)) - >> PAGE_CACHE_SHIFT); + page = find_get_page(inode->i_mapping, + (max_bytes + SZ_1M) >> PAGE_CACHE_SHIFT); if (!page) { test_msg("Couldn't find our page\n"); goto out_bits; @@ -268,8 +272,139 @@ out: return ret; } +static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, + unsigned long len) +{ + unsigned long i, x; + + memset(bitmap, 0, len); + memset_extent_buffer(eb, 0, 0, len); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Bitmap was not zeroed\n"); + return -EINVAL; + } + + bitmap_set(bitmap, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Setting all bits failed\n"); + return -EINVAL; + } + + bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Clearing all bits failed\n"); + return -EINVAL; + } + + bitmap_set(bitmap, (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Setting straddling pages failed\n"); + return -EINVAL; + } + + bitmap_set(bitmap, 0, len * BITS_PER_BYTE); + bitmap_clear(bitmap, + (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Clearing straddling pages failed\n"); + return -EINVAL; + } + + /* + * Generate a wonky pseudo-random bit pattern for the sake of not using + * something repetitive that could miss some hypothetical off-by-n bug. + */ + x = 0; + for (i = 0; i < len / sizeof(long); i++) { + x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffUL; + bitmap[i] = x; + } + write_extent_buffer(eb, bitmap, 0, len); + + for (i = 0; i < len * BITS_PER_BYTE; i++) { + int bit, bit1; + + bit = !!test_bit(i, bitmap); + bit1 = !!extent_buffer_test_bit(eb, 0, i); + if (bit1 != bit) { + test_msg("Testing bit pattern failed\n"); + return -EINVAL; + } + + bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, + i % BITS_PER_BYTE); + if (bit1 != bit) { + test_msg("Testing bit pattern with offset failed\n"); + return -EINVAL; + } + } + + return 0; +} + +static int test_eb_bitmaps(void) +{ + unsigned long len = PAGE_CACHE_SIZE * 4; + unsigned long *bitmap; + struct extent_buffer *eb; + int ret; + + test_msg("Running extent buffer bitmap tests\n"); + + bitmap = kmalloc(len, GFP_NOFS); + if (!bitmap) { + test_msg("Couldn't allocate test bitmap\n"); + return -ENOMEM; + } + + eb = __alloc_dummy_extent_buffer(NULL, 0, len); + if (!eb) { + test_msg("Couldn't allocate test extent buffer\n"); + kfree(bitmap); + return -ENOMEM; + } + + ret = __test_eb_bitmaps(bitmap, eb, len); + if (ret) + goto out; + + /* Do it over again with an extent buffer which isn't page-aligned. */ + free_extent_buffer(eb); + eb = __alloc_dummy_extent_buffer(NULL, PAGE_CACHE_SIZE / 2, len); + if (!eb) { + test_msg("Couldn't allocate test extent buffer\n"); + kfree(bitmap); + return -ENOMEM; + } + + ret = __test_eb_bitmaps(bitmap, eb, len); +out: + free_extent_buffer(eb); + kfree(bitmap); + return ret; +} + int btrfs_test_extent_io(void) { - test_msg("Running find delalloc tests\n"); - return test_find_delalloc(); + int ret; + + test_msg("Running extent I/O tests\n"); + + ret = test_find_delalloc(); + if (ret) + goto out; + + ret = test_eb_bitmaps(); +out: + test_msg("Extent I/O tests finished\n"); + return ret; } diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 8b72b005bfb9..c9ad97b1e690 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -23,41 +23,6 @@ #include "../free-space-cache.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) -static struct btrfs_block_group_cache *init_test_block_group(void) -{ - struct btrfs_block_group_cache *cache; - - cache = kzalloc(sizeof(*cache), GFP_NOFS); - if (!cache) - return NULL; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return NULL; - } - cache->fs_info = btrfs_alloc_dummy_fs_info(); - if (!cache->fs_info) { - kfree(cache->free_space_ctl); - kfree(cache); - return NULL; - } - - cache->key.objectid = 0; - cache->key.offset = 1024 * 1024 * 1024; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = 4096; - cache->full_stripe_len = 4096; - - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->bg_list); - - btrfs_init_free_space_ctl(cache); - - return cache; -} /* * This test just does basic sanity checking, making sure we can add an exten @@ -71,59 +36,59 @@ static int test_extents(struct btrfs_block_group_cache *cache) test_msg("Running extent only tests\n"); /* First just make sure we can remove an entire entry */ - ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_add_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error adding initial extents %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error removing extent %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_4M)) { test_msg("Full remove left some lingering space\n"); return -1; } /* Ok edge and middle cases now */ - ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_add_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error adding half extent %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M); if (ret) { test_msg("Error removing tail end %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_1M); if (ret) { test_msg("Error removing front end %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); + ret = btrfs_remove_free_space(cache, SZ_2M, 4096); if (ret) { test_msg("Error removing middle piece %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_1M)) { test_msg("Still have space at the front\n"); return -1; } - if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) { + if (test_check_exists(cache, SZ_2M, 4096)) { test_msg("Still have space in the middle\n"); return -1; } - if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { + if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) { test_msg("Still have space at the end\n"); return -1; } @@ -141,30 +106,30 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) test_msg("Running bitmap only tests\n"); - ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); if (ret) { test_msg("Couldn't create a bitmap entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error removing bitmap full range %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_4M)) { test_msg("Left some space in bitmap\n"); return -1; } - ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); if (ret) { test_msg("Couldn't add to our bitmap entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M); if (ret) { test_msg("Couldn't remove middle chunk %d\n", ret); return ret; @@ -177,23 +142,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); /* Test a bit straddling two bitmaps */ - ret = test_add_free_space_entry(cache, next_bitmap_offset - - (2 * 1024 * 1024), 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M, + SZ_4M, 1); if (ret) { test_msg("Couldn't add space that straddles two bitmaps %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, next_bitmap_offset - - (1 * 1024 * 1024), 2 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M); if (ret) { test_msg("Couldn't remove overlapping space %d\n", ret); return ret; } - if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), - 2 * 1024 * 1024)) { + if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) { test_msg("Left some space when removing overlapping\n"); return -1; } @@ -216,43 +179,43 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * bitmap, but the free space completely in the extent and then * completely in the bitmap. */ - ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1); if (ret) { test_msg("Couldn't create bitmap entry %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_1M); if (ret) { test_msg("Couldn't remove extent entry %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_1M)) { test_msg("Left remnants after our remove\n"); return -1; } /* Now to add back the extent entry and remove from the bitmap */ - ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); if (ret) { test_msg("Couldn't re-add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M); if (ret) { test_msg("Couldn't remove from bitmap %d\n", ret); return ret; } - if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { + if (test_check_exists(cache, SZ_4M, SZ_1M)) { test_msg("Left remnants in the bitmap\n"); return -1; } @@ -261,19 +224,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * Ok so a little more evil, extent entry and bitmap at the same offset, * removing an overlapping chunk. */ - ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1); if (ret) { test_msg("Couldn't add to a bitmap %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M); if (ret) { test_msg("Couldn't remove overlapping space %d\n", ret); return ret; } - if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { + if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) { test_msg("Left over pieces after removing overlapping\n"); return -1; } @@ -281,25 +244,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) __btrfs_remove_free_space_cache(cache->free_space_ctl); /* Now with the extent entry offset into the bitmap */ - ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1); if (ret) { test_msg("Couldn't add space to the bitmap %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0); if (ret) { test_msg("Couldn't add extent to the cache %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M); if (ret) { test_msg("Problem removing overlapping space %d\n", ret); return ret; } - if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { + if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) { test_msg("Left something behind when removing space"); return -1; } @@ -315,29 +278,26 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * [ del ] */ __btrfs_remove_free_space_cache(cache->free_space_ctl); - ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, - 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1); if (ret) { test_msg("Couldn't add bitmap %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, - 5 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M, + 5 * SZ_1M, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, - 5 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M); if (ret) { test_msg("Failed to free our space %d\n", ret); return ret; } - if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024, - 5 * 1024 * 1024)) { + if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) { test_msg("Left stuff over\n"); return -1; } @@ -350,19 +310,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * to return -EAGAIN back from btrfs_remove_extent, make sure this * doesn't happen. */ - ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1); if (ret) { test_msg("Couldn't add bitmap entry %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M); if (ret) { test_msg("Error removing bitmap and extent overlapping %d\n", ret); return ret; @@ -445,9 +405,11 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) int ret; u64 offset; u64 max_extent_size; - - bool (*use_bitmap_op)(struct btrfs_free_space_ctl *, - struct btrfs_free_space *); + const struct btrfs_free_space_op test_free_space_ops = { + .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds, + .use_bitmap = test_use_bitmap, + }; + const struct btrfs_free_space_op *orig_free_space_ops; test_msg("Running space stealing from bitmap to extent\n"); @@ -469,22 +431,21 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * that forces use of bitmaps as soon as we have at least 1 * extent entry. */ - use_bitmap_op = cache->free_space_ctl->op->use_bitmap; - cache->free_space_ctl->op->use_bitmap = test_use_bitmap; + orig_free_space_ops = cache->free_space_ctl->op; + cache->free_space_ctl->op = &test_free_space_ops; /* * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[ */ - ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024, - 128 * 1024, 0); + ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */ - ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024, - 128 * 1024 * 1024 - 512 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K, + SZ_128M - SZ_512K, 1); if (ret) { test_msg("Couldn't add bitmap entry %d\n", ret); return ret; @@ -502,21 +463,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * [128Mb + 512Kb, 128Mb + 768Kb[ */ ret = btrfs_remove_free_space(cache, - 128 * 1024 * 1024 + 768 * 1024, - 128 * 1024 * 1024 - 768 * 1024); + SZ_128M + 768 * SZ_1K, + SZ_128M - 768 * SZ_1K); if (ret) { test_msg("Failed to free part of bitmap space %d\n", ret); return ret; } /* Confirm that only those 2 ranges are marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, - 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) { test_msg("Free space range missing\n"); return -ENOENT; } - if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024, - 256 * 1024)) { + if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) { test_msg("Free space range missing\n"); return -ENOENT; } @@ -525,8 +484,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked * as free anymore. */ - if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024, - 128 * 1024 * 1024 - 768 * 1024)) { + if (test_check_exists(cache, SZ_128M + 768 * SZ_1K, + SZ_128M - 768 * SZ_1K)) { test_msg("Bitmap region not removed from space cache\n"); return -EINVAL; } @@ -535,8 +494,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is * covered by the bitmap, isn't marked as free. */ - if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024, - 256 * 1024)) { + if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) { test_msg("Invalid bitmap region marked as free\n"); return -EINVAL; } @@ -545,8 +503,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered * by the bitmap too, isn't marked as free either. */ - if (test_check_exists(cache, 128 * 1024 * 1024, - 256 * 1024)) { + if (test_check_exists(cache, SZ_128M, SZ_256K)) { test_msg("Invalid bitmap region marked as free\n"); return -EINVAL; } @@ -556,13 +513,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * lets make sure the free space cache marks it as free in the bitmap, * and doesn't insert a new extent entry to represent this region. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) { + if (!test_check_exists(cache, SZ_128M, SZ_512K)) { test_msg("Bitmap region not marked as free\n"); return -ENOENT; } @@ -581,8 +538,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024, - 4096); + ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -601,15 +557,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * expand the range covered by the existing extent entry that represents * the free space [128Mb - 256Kb, 128Mb - 128Kb[. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024, - 128 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024, - 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) { test_msg("Extent region not marked as free\n"); return -ENOENT; } @@ -637,21 +591,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * that represents the 1Mb free space, and therefore we're able to * allocate the whole free space at once. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, - 1 * 1024 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) { test_msg("Expected region not marked as free\n"); return -ENOENT; } - if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) { + if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) { test_msg("Cache free space is not 1Mb + 4Kb\n"); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 1 * 1024 * 1024, 0, + 0, SZ_1M, 0, &max_extent_size); - if (offset != (128 * 1024 * 1024 - 256 * 1024)) { + if (offset != (SZ_128M - SZ_256K)) { test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -670,7 +623,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0, &max_extent_size); - if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) { + if (offset != (SZ_128M + SZ_16M)) { test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -691,16 +644,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) /* * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[ */ - ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024, - 128 * 1024, 0); + ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */ - ret = test_add_free_space_entry(cache, 0, - 128 * 1024 * 1024 - 512 * 1024, 1); + ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1); if (ret) { test_msg("Couldn't add bitmap entry %d\n", ret); return ret; @@ -717,22 +668,18 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * [128Mb + 128b, 128Mb + 256Kb[ * [128Mb - 768Kb, 128Mb - 512Kb[ */ - ret = btrfs_remove_free_space(cache, - 0, - 128 * 1024 * 1024 - 768 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K); if (ret) { test_msg("Failed to free part of bitmap space %d\n", ret); return ret; } /* Confirm that only those 2 ranges are marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024, - 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) { test_msg("Free space range missing\n"); return -ENOENT; } - if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, - 256 * 1024)) { + if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) { test_msg("Free space range missing\n"); return -ENOENT; } @@ -741,8 +688,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked * as free anymore. */ - if (test_check_exists(cache, 0, - 128 * 1024 * 1024 - 768 * 1024)) { + if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) { test_msg("Bitmap region not removed from space cache\n"); return -EINVAL; } @@ -751,8 +697,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the region [128Mb - 512Kb, 128Mb[, which is * covered by the bitmap, isn't marked as free. */ - if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, - 512 * 1024)) { + if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { test_msg("Invalid bitmap region marked as free\n"); return -EINVAL; } @@ -762,15 +707,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * lets make sure the free space cache marks it as free in the bitmap, * and doesn't insert a new extent entry to represent this region. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024, - 512 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, - 512 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { test_msg("Bitmap region not marked as free\n"); return -ENOENT; } @@ -789,7 +732,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192); + ret = btrfs_add_free_space(cache, SZ_32M, 8192); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -800,13 +743,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * expand the range covered by the existing extent entry that represents * the free space [128Mb + 128Kb, 128Mb + 256Kb[. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M, SZ_128K)) { test_msg("Extent region not marked as free\n"); return -ENOENT; } @@ -834,21 +777,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * that represents the 1Mb free space, and therefore we're able to * allocate the whole free space at once. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, - 1 * 1024 * 1024)) { + if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) { test_msg("Expected region not marked as free\n"); return -ENOENT; } - if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) { + if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) { test_msg("Cache free space is not 1Mb + 8Kb\n"); return -EINVAL; } - offset = btrfs_find_space_for_alloc(cache, - 0, 1 * 1024 * 1024, 0, + offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0, &max_extent_size); - if (offset != (128 * 1024 * 1024 - 768 * 1024)) { + if (offset != (SZ_128M - 768 * SZ_1K)) { test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -867,7 +808,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) offset = btrfs_find_space_for_alloc(cache, 0, 8192, 0, &max_extent_size); - if (offset != (32 * 1024 * 1024)) { + if (offset != SZ_32M) { test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -877,7 +818,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) if (ret) return ret; - cache->free_space_ctl->op->use_bitmap = use_bitmap_op; + cache->free_space_ctl->op = orig_free_space_ops; __btrfs_remove_free_space_cache(cache->free_space_ctl); return 0; @@ -891,7 +832,7 @@ int btrfs_test_free_space_cache(void) test_msg("Running btrfs free space cache tests\n"); - cache = init_test_block_group(); + cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024); if (!cache) { test_msg("Couldn't run the tests\n"); return 0; @@ -922,9 +863,7 @@ int btrfs_test_free_space_cache(void) ret = test_steal_space_from_bitmap_to_extent(cache); out: - __btrfs_remove_free_space_cache(cache->free_space_ctl); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); test_msg("Free space cache tests finished\n"); return ret; diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c new file mode 100644 index 000000000000..d05fe1ab4808 --- /dev/null +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -0,0 +1,571 @@ +/* + * Copyright (C) 2015 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../disk-io.h" +#include "../free-space-tree.h" +#include "../transaction.h" + +struct free_space_extent { + u64 start, length; +}; + +/* + * The test cases align their operations to this in order to hit some of the + * edge cases in the bitmap code. + */ +#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096) + +static int __check_free_space_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path, + struct free_space_extent *extents, + unsigned int num_extents) +{ + struct btrfs_free_space_info *info; + struct btrfs_key key; + int prev_bit = 0, bit; + u64 extent_start = 0, offset, end; + u32 flags, extent_count; + unsigned int i; + int ret; + + info = search_free_space_info(trans, fs_info, cache, path, 0); + if (IS_ERR(info)) { + test_msg("Could not find free space info\n"); + ret = PTR_ERR(info); + goto out; + } + flags = btrfs_free_space_flags(path->nodes[0], info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + + if (extent_count != num_extents) { + test_msg("Extent count is wrong\n"); + ret = -EINVAL; + goto out; + } + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + if (path->slots[0] != 0) + goto invalid; + end = cache->key.objectid + cache->key.offset; + i = 0; + while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.type != BTRFS_FREE_SPACE_BITMAP_KEY) + goto invalid; + offset = key.objectid; + while (offset < key.objectid + key.offset) { + bit = free_space_test_bit(cache, path, offset); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + if (i >= num_extents) + goto invalid; + if (i >= num_extents || + extent_start != extents[i].start || + offset - extent_start != extents[i].length) + goto invalid; + i++; + } + prev_bit = bit; + offset += cache->sectorsize; + } + } + if (prev_bit == 1) { + if (i >= num_extents || + extent_start != extents[i].start || + end - extent_start != extents[i].length) + goto invalid; + i++; + } + if (i != num_extents) + goto invalid; + } else { + if (btrfs_header_nritems(path->nodes[0]) != num_extents + 1 || + path->slots[0] != 0) + goto invalid; + for (i = 0; i < num_extents; i++) { + path->slots[0]++; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY || + key.objectid != extents[i].start || + key.offset != extents[i].length) + goto invalid; + } + } + + ret = 0; +out: + btrfs_release_path(path); + return ret; +invalid: + test_msg("Free space tree is invalid\n"); + ret = -EINVAL; + goto out; +} + +static int check_free_space_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path, + struct free_space_extent *extents, + unsigned int num_extents) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + info = search_free_space_info(trans, fs_info, cache, path, 0); + if (IS_ERR(info)) { + test_msg("Could not find free space info\n"); + btrfs_release_path(path); + return PTR_ERR(info); + } + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + ret = __check_free_space_extents(trans, fs_info, cache, path, extents, + num_extents); + if (ret) + return ret; + + /* Flip it to the other format and check that for good measure. */ + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + ret = convert_free_space_to_extents(trans, fs_info, cache, path); + if (ret) { + test_msg("Could not convert to extents\n"); + return ret; + } + } else { + ret = convert_free_space_to_bitmaps(trans, fs_info, cache, path); + if (ret) { + test_msg("Could not convert to bitmaps\n"); + return ret; + } + } + return __check_free_space_extents(trans, fs_info, cache, path, extents, + num_extents); +} + +static int test_empty_block_group(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, cache->key.offset}, + }; + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_all(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = {}; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_beginning(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid + BITMAP_RANGE, + cache->key.offset - BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); + +} + +static int test_remove_end(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, cache->key.offset - BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + + cache->key.offset - BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_middle(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, BITMAP_RANGE}, + {cache->key.objectid + 2 * BITMAP_RANGE, + cache->key.offset - 2 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_left(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, 2 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_right(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid + BITMAP_RANGE, 2 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 2 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_both(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, 3 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 2 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_none(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, BITMAP_RANGE}, + {cache->key.objectid + 2 * BITMAP_RANGE, BITMAP_RANGE}, + {cache->key.objectid + 4 * BITMAP_RANGE, BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 4 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 2 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +typedef int (*test_func_t)(struct btrfs_trans_handle *, + struct btrfs_fs_info *, + struct btrfs_block_group_cache *, + struct btrfs_path *); + +static int run_test(test_func_t test_func, int bitmaps) +{ + struct btrfs_root *root = NULL; + struct btrfs_block_group_cache *cache = NULL; + struct btrfs_trans_handle trans; + struct btrfs_path *path = NULL; + int ret; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate dummy root\n"); + ret = PTR_ERR(root); + goto out; + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; + goto out; + } + + btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE); + root->fs_info->free_space_root = root; + root->fs_info->tree_root = root; + + root->node = alloc_test_extent_buffer(root->fs_info, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 8192; + + cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE); + if (!cache) { + test_msg("Couldn't allocate dummy block group cache\n"); + ret = -ENOMEM; + goto out; + } + cache->bitmap_low_thresh = 0; + cache->bitmap_high_thresh = (u32)-1; + cache->needs_free_space = 1; + + btrfs_init_dummy_trans(&trans); + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + ret = add_block_group_free_space(&trans, root->fs_info, cache); + if (ret) { + test_msg("Could not add block group free space\n"); + goto out; + } + + if (bitmaps) { + ret = convert_free_space_to_bitmaps(&trans, root->fs_info, + cache, path); + if (ret) { + test_msg("Could not convert block group to bitmaps\n"); + goto out; + } + } + + ret = test_func(&trans, root->fs_info, cache, path); + if (ret) + goto out; + + ret = remove_block_group_free_space(&trans, root->fs_info, cache); + if (ret) { + test_msg("Could not remove block group free space\n"); + goto out; + } + + if (btrfs_header_nritems(root->node) != 0) { + test_msg("Free space tree has leftover items\n"); + ret = -EINVAL; + goto out; + } + + ret = 0; +out: + btrfs_free_path(path); + btrfs_free_dummy_block_group(cache); + btrfs_free_dummy_root(root); + return ret; +} + +static int run_test_both_formats(test_func_t test_func) +{ + int ret; + + ret = run_test(test_func, 0); + if (ret) + return ret; + return run_test(test_func, 1); +} + +int btrfs_test_free_space_tree(void) +{ + test_func_t tests[] = { + test_empty_block_group, + test_remove_all, + test_remove_beginning, + test_remove_end, + test_remove_middle, + test_merge_left, + test_merge_right, + test_merge_both, + test_merge_none, + }; + int i; + + test_msg("Running free space tree tests\n"); + for (i = 0; i < ARRAY_SIZE(tests); i++) { + int ret = run_test_both_formats(tests[i]); + if (ret) { + test_msg("%pf failed\n", tests[i]); + return ret; + } + } + + return 0; +} diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 054fc0d97131..5de55fdd28bc 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -100,7 +100,7 @@ static void insert_inode_item_key(struct btrfs_root *root) static void setup_file_extents(struct btrfs_root *root) { int slot = 0; - u64 disk_bytenr = 1 * 1024 * 1024; + u64 disk_bytenr = SZ_1M; u64 offset = 0; /* First we want a hole */ diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 846d277b1901..8ea5d34bc5a2 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -23,14 +23,6 @@ #include "../qgroup.h" #include "../backref.h" -static void init_dummy_trans(struct btrfs_trans_handle *trans) -{ - memset(trans, 0, sizeof(*trans)); - trans->transid = 1; - INIT_LIST_HEAD(&trans->qgroup_ref_list); - trans->type = __TRANS_DUMMY; -} - static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid) { @@ -44,7 +36,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); ins.objectid = bytenr; ins.type = BTRFS_EXTENT_ITEM_KEY; @@ -94,7 +86,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 refs; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -144,7 +136,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, struct btrfs_path *path; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -178,7 +170,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, u64 refs; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -232,7 +224,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root) struct ulist *new_roots = NULL; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); test_msg("Qgroup basic add\n"); ret = btrfs_create_qgroup(NULL, fs_info, 5); @@ -326,7 +318,7 @@ static int test_multiple_refs(struct btrfs_root *root) struct ulist *new_roots = NULL; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); test_msg("Qgroup multiple refs test\n"); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index be8eae80ff65..b6031ce474f7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) list_del_init(&em->list); free_extent_map(em); } + /* + * If any block groups are found in ->deleted_bgs then it's + * because the transaction was aborted and a commit did not + * happen (things failed before writing the new superblock + * and calling btrfs_finish_extent_commit()), so we can not + * discard the physical locations of the block groups. + */ + while (!list_empty(&transaction->deleted_bgs)) { + struct btrfs_block_group_cache *cache; + + cache = list_first_entry(&transaction->deleted_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&cache->bg_list); + btrfs_put_block_group_trimming(cache); + btrfs_put_block_group(cache); + } kmem_cache_free(btrfs_transaction_cachep, transaction); } } @@ -634,17 +651,20 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN, 0); + return start_transaction(root, 0, TRANS_JOIN, + BTRFS_RESERVE_NO_FLUSH); } struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0); + return start_transaction(root, 0, TRANS_JOIN_NOLOCK, + BTRFS_RESERVE_NO_FLUSH); } struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_USERSPACE, 0); + return start_transaction(root, 0, TRANS_USERSPACE, + BTRFS_RESERVE_NO_FLUSH); } /* @@ -662,7 +682,8 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root */ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_ATTACH, 0); + return start_transaction(root, 0, TRANS_ATTACH, + BTRFS_RESERVE_NO_FLUSH); } /* @@ -677,7 +698,8 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) { struct btrfs_trans_handle *trans; - trans = start_transaction(root, 0, TRANS_ATTACH, 0); + trans = start_transaction(root, 0, TRANS_ATTACH, + BTRFS_RESERVE_NO_FLUSH); if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) btrfs_wait_for_commit(root, 0); @@ -1319,17 +1341,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 root_flags; uuid_le new_uuid; - path = btrfs_alloc_path(); - if (!path) { - pending->error = -ENOMEM; - return 0; - } + ASSERT(pending->path); + path = pending->path; - new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); - if (!new_root_item) { - pending->error = -ENOMEM; - goto root_item_alloc_fail; - } + ASSERT(pending->root_item); + new_root_item = pending->root_item; pending->error = btrfs_find_free_objectid(tree_root, &objectid); if (pending->error) @@ -1562,8 +1578,10 @@ clear_skip_qgroup: btrfs_clear_skip_qgroup(trans); no_free_objectid: kfree(new_root_item); -root_item_alloc_fail: + pending->root_item = NULL; btrfs_free_path(path); + pending->path = NULL; + return ret; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 64c8221b6165..72be51f7ca2f 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -137,8 +137,10 @@ struct btrfs_pending_snapshot { struct dentry *dentry; struct inode *dir; struct btrfs_root *root; + struct btrfs_root_item *root_item; struct btrfs_root *snap; struct btrfs_qgroup_inherit *inherit; + struct btrfs_path *path; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; u64 qgroup_reserved; diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index f31db4325339..cb65089127cc 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } btrfs_release_path(path); + /* + * We don't need a lock on a leaf. btrfs_realloc_node() will lock all + * leafs from path->nodes[1], so set lowest_level to 1 to avoid later + * a deadlock (attempting to write lock an already write locked leaf). + */ + path->lowest_level = 1; wret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (wret < 0) { @@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, ret = 0; goto out; } - path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1, - min_trans); + /* + * The node at level 1 must always be locked when our path has + * keep_locks set and lowest_level is 1, regardless of the value of + * path->slots[1]. + */ + BUG_ON(path->locks[1] == 0); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, &last_ret, @@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, WARN_ON(ret == -EAGAIN); goto out; } + /* + * Now that we reallocated the node we can find the next key. Note that + * btrfs_find_next_key() can release our path and do another search + * without COWing, this is because even with path->keep_locks = 1, + * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a + * node when path->slots[node_level - 1] does not point to the last + * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore + * we search for the next key after reallocating our node. + */ + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, + min_trans); if (next_key_ret == 0) { memcpy(&root->defrag_progress, &key, sizeof(key)); ret = -EAGAIN; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a23399e8e3ab..c32abbca9d77 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -125,6 +125,7 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); +static void btrfs_close_one_device(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -1102,7 +1103,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; key.objectid = device->devid; key.offset = start; @@ -1257,6 +1258,15 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction, int ret; int slot; struct extent_buffer *l; + u64 min_search_start; + + /* + * We don't want to overwrite the superblock on the drive nor any area + * used by the boot loader (grub for example), so we make sure to start + * at an offset of at least 1MB. + */ + min_search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + search_start = max(search_start, min_search_start); path = btrfs_alloc_path(); if (!path) @@ -1271,7 +1281,7 @@ again: goto out; } - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; path->skip_locking = 1; @@ -1397,18 +1407,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { - struct btrfs_root *root = device->dev_root; - u64 search_start; - /* FIXME use last free of some kind */ - - /* - * we don't want to overwrite the superblock on the drive, - * so we make sure to start at an offset of at least 1MB - */ - search_start = max(root->fs_info->alloc_start, 1024ull * 1024); return find_free_dev_extent_start(trans->transaction, device, - num_bytes, search_start, start, len); + num_bytes, 0, start, len); } static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, @@ -1642,7 +1643,6 @@ static void update_dev_time(char *path_name) return; file_update_time(filp); filp_close(filp, NULL); - return; } static int btrfs_rm_dev_item(struct btrfs_root *root, @@ -3406,7 +3406,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) list_for_each_entry(device, devices, dev_list) { old_size = btrfs_device_get_total_bytes(device); size_to_free = div_factor(old_size, 1); - size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); + size_to_free = min_t(u64, size_to_free, SZ_1M); if (!device->writeable || btrfs_device_get_total_bytes(device) - btrfs_device_get_bytes_used(device) > size_to_free || @@ -3723,14 +3723,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, goto out; } - /* allow dup'ed data chunks only in mixed mode */ - if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { - btrfs_err(fs_info, "dup for data is not allowed"); - ret = -EINVAL; - goto out; - } - /* allow to reduce meta or sys integrity only if force set */ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | @@ -3756,6 +3748,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } while (read_seqretry(&fs_info->profiles_lock, seq)); + if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < + btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { + btrfs_warn(fs_info, + "metatdata profile 0x%llx has lower redundancy than data profile 0x%llx", + bctl->meta.target, bctl->data.target); + } + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { fs_info->num_tolerated_disk_barrier_failures = min( btrfs_calc_num_tolerated_disk_barrier_failures(fs_info), @@ -4268,7 +4267,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; lock_chunks(root); @@ -4460,7 +4459,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b) static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) { /* TODO allow them to set a preferred stripe size */ - return 64 * 1024; + return SZ_64K; } static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) @@ -4528,21 +4527,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ncopies = btrfs_raid_array[index].ncopies; if (type & BTRFS_BLOCK_GROUP_DATA) { - max_stripe_size = 1024 * 1024 * 1024; + max_stripe_size = SZ_1G; max_chunk_size = 10 * max_stripe_size; if (!devs_max) devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ - if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) - max_stripe_size = 1024 * 1024 * 1024; + if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) + max_stripe_size = SZ_1G; else - max_stripe_size = 256 * 1024 * 1024; + max_stripe_size = SZ_256M; max_chunk_size = max_stripe_size; if (!devs_max) devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = 32 * 1024 * 1024; + max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; if (!devs_max) devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; @@ -4793,7 +4792,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, u64 dev_offset; u64 stripe_size; int i = 0; - int ret; + int ret = 0; em_tree = &extent_root->fs_info->mapping_tree.map_tree; read_lock(&em_tree->lock); @@ -4824,20 +4823,32 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, goto out; } + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with the map's stripes, because the device object's id can change + * at any time during that final phase of the device replace operation + * (dev-replace.c:btrfs_dev_replace_finishing()). + */ + mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { device = map->stripes[i].dev; dev_offset = map->stripes[i].physical; ret = btrfs_update_device(trans, device); if (ret) - goto out; + break; ret = btrfs_alloc_dev_extent(trans, device, chunk_root->root_key.objectid, BTRFS_FIRST_CHUNK_TREE_OBJECTID, chunk_offset, dev_offset, stripe_size); if (ret) - goto out; + break; + } + if (ret) { + mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex); + goto out; } stripe = &chunk->stripe; @@ -4850,6 +4861,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); stripe++; } + mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex); btrfs_set_stack_chunk_length(chunk, chunk_size); btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); @@ -6465,11 +6477,11 @@ int btrfs_read_sys_array(struct btrfs_root *root) sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET); if (!sb) return -ENOMEM; - btrfs_set_buffer_uptodate(sb); + set_extent_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); /* * The sb extent buffer is artifical and just used to read the system array. - * btrfs_set_buffer_uptodate() call does not properly mark all it's + * set_extent_buffer_uptodate() call does not properly mark all it's * pages up-to-date when the page is larger: extent does not cover the * whole page and consequently check_page_uptodate does not find all * the page's extents up-to-date (the hole beyond sb), @@ -6512,6 +6524,14 @@ int btrfs_read_sys_array(struct btrfs_root *root) goto out_short_read; num_stripes = btrfs_chunk_num_stripes(sb, chunk); + if (!num_stripes) { + printk(KERN_ERR + "BTRFS: invalid number of stripes %u in sys_array at offset %u\n", + num_stripes, cur_offset); + ret = -EIO; + break; + } + len = btrfs_chunk_item_size(num_stripes); if (cur_offset + len > array_size) goto out_short_read; @@ -6520,6 +6540,9 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (ret) break; } else { + printk(KERN_ERR + "BTRFS: unexpected item type %u in sys_array at offset %u\n", + (u32)key.type, cur_offset); ret = -EIO; break; } @@ -6949,7 +6972,7 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) } } -void btrfs_close_one_device(struct btrfs_device *device) +static void btrfs_close_one_device(struct btrfs_device *device) { struct btrfs_fs_devices *fs_devices = device->fs_devices; struct btrfs_device *new_device; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d5c84f6b1353..1939ebde63df 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -26,7 +26,7 @@ extern struct mutex uuid_mutex; -#define BTRFS_STRIPE_LEN (64 * 1024) +#define BTRFS_STRIPE_LEN SZ_64K struct buffer_head; struct btrfs_pending_bios { @@ -566,6 +566,5 @@ static inline void unlock_chunks(struct btrfs_root *root) struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); -void btrfs_close_one_device(struct btrfs_device *device); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 7cbef1a14fe1..fd953c361a43 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -283,7 +283,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; /* search for our xattrs */ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -446,7 +446,7 @@ static int btrfs_initxattrs(struct inode *inode, for (xattr = xattr_array; xattr->name != NULL; xattr++) { name = kmalloc(XATTR_SECURITY_PREFIX_LEN + - strlen(xattr->name) + 1, GFP_NOFS); + strlen(xattr->name) + 1, GFP_KERNEL); if (!name) { err = -ENOMEM; break; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f446afada328..ca4d5e8457f1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -639,8 +639,8 @@ static int __init init_caches(void) ceph_inode_cachep = kmem_cache_create("ceph_inode_info", sizeof(struct ceph_inode_info), __alignof__(struct ceph_inode_info), - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), - ceph_inode_init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ceph_inode_init_once); if (ceph_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index b7fcb3151103..c4c1169814b2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1092,7 +1092,7 @@ cifs_init_inodecache(void) cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", sizeof(struct cifsInodeInfo), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), cifs_init_once); if (cifs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0068e82217c3..0a2752b79e72 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3391,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, * should have access to this page, we're safe to simply set * PG_locked without checking it first. */ - __set_page_locked(page); + __SetPageLocked(page); rc = add_to_page_cache_locked(page, mapping, page->index, gfp); /* give up if we can't stick it in the cache */ if (rc) { - __clear_page_locked(page); + __ClearPageLocked(page); return rc; } @@ -3418,9 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, if (*bytes + PAGE_CACHE_SIZE > rsize) break; - __set_page_locked(page); + __SetPageLocked(page); if (add_to_page_cache_locked(page, mapping, page->index, gfp)) { - __clear_page_locked(page); + __ClearPageLocked(page); break; } list_move_tail(&page->lru, tmplist); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index cac1390b87a3..57e81cbba0fa 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -74,9 +74,9 @@ static void init_once(void *foo) int __init coda_init_inodecache(void) { coda_inode_cachep = kmem_cache_create("coda_inode_cache", - sizeof(struct coda_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once); + sizeof(struct coda_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, init_once); if (coda_inode_cachep == NULL) return -ENOMEM; return 0; @@ -28,54 +28,68 @@ #include <linux/sched.h> #include <linux/uio.h> #include <linux/vmstat.h> +#include <linux/pfn_t.h> +#include <linux/sizes.h> + +static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) +{ + struct request_queue *q = bdev->bd_queue; + long rc = -EIO; + + dax->addr = (void __pmem *) ERR_PTR(-EIO); + if (blk_queue_enter(q, true) != 0) + return rc; + + rc = bdev_direct_access(bdev, dax); + if (rc < 0) { + dax->addr = (void __pmem *) ERR_PTR(rc); + blk_queue_exit(q); + return rc; + } + return rc; +} + +static void dax_unmap_atomic(struct block_device *bdev, + const struct blk_dax_ctl *dax) +{ + if (IS_ERR(dax->addr)) + return; + blk_queue_exit(bdev->bd_queue); +} /* * dax_clear_blocks() is called from within transaction context from XFS, * and hence this means the stack from this point must follow GFP_NOFS * semantics for all operations. */ -int dax_clear_blocks(struct inode *inode, sector_t block, long size) +int dax_clear_blocks(struct inode *inode, sector_t block, long _size) { struct block_device *bdev = inode->i_sb->s_bdev; - sector_t sector = block << (inode->i_blkbits - 9); + struct blk_dax_ctl dax = { + .sector = block << (inode->i_blkbits - 9), + .size = _size, + }; might_sleep(); do { - void __pmem *addr; - unsigned long pfn; - long count; + long count, sz; - count = bdev_direct_access(bdev, sector, &addr, &pfn, size); + count = dax_map_atomic(bdev, &dax); if (count < 0) return count; - BUG_ON(size < count); - while (count > 0) { - unsigned pgsz = PAGE_SIZE - offset_in_page(addr); - if (pgsz > count) - pgsz = count; - clear_pmem(addr, pgsz); - addr += pgsz; - size -= pgsz; - count -= pgsz; - BUG_ON(pgsz & 511); - sector += pgsz / 512; - cond_resched(); - } - } while (size); + sz = min_t(long, count, SZ_128K); + clear_pmem(dax.addr, sz); + dax.size -= sz; + dax.sector += sz / 512; + dax_unmap_atomic(bdev, &dax); + cond_resched(); + } while (dax.size); wmb_pmem(); return 0; } EXPORT_SYMBOL_GPL(dax_clear_blocks); -static long dax_get_addr(struct buffer_head *bh, void __pmem **addr, - unsigned blkbits) -{ - unsigned long pfn; - sector_t sector = bh->b_blocknr << (blkbits - 9); - return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); -} - /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, loff_t pos, loff_t end) @@ -105,19 +119,29 @@ static bool buffer_size_valid(struct buffer_head *bh) return bh->b_state != 0; } + +static sector_t to_sector(const struct buffer_head *bh, + const struct inode *inode) +{ + sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); + + return sector; +} + static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, loff_t start, loff_t end, get_block_t get_block, struct buffer_head *bh) { - ssize_t retval = 0; - loff_t pos = start; - loff_t max = start; - loff_t bh_max = start; - void __pmem *addr; - bool hole = false; - bool need_wmb = false; - - if (iov_iter_rw(iter) != WRITE) + loff_t pos = start, max = start, bh_max = start; + bool hole = false, need_wmb = false; + struct block_device *bdev = NULL; + int rw = iov_iter_rw(iter), rc; + long map_len = 0; + struct blk_dax_ctl dax = { + .addr = (void __pmem *) ERR_PTR(-EIO), + }; + + if (rw == READ) end = min(end, i_size_read(inode)); while (pos < end) { @@ -132,13 +156,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, if (pos == bh_max) { bh->b_size = PAGE_ALIGN(end - pos); bh->b_state = 0; - retval = get_block(inode, block, bh, - iov_iter_rw(iter) == WRITE); - if (retval) + rc = get_block(inode, block, bh, rw == WRITE); + if (rc) break; if (!buffer_size_valid(bh)) bh->b_size = 1 << blkbits; bh_max = pos - first + bh->b_size; + bdev = bh->b_bdev; } else { unsigned done = bh->b_size - (bh_max - (pos - first)); @@ -146,47 +170,53 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, bh->b_size -= done; } - hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh); + hole = rw == READ && !buffer_written(bh); if (hole) { - addr = NULL; size = bh->b_size - first; } else { - retval = dax_get_addr(bh, &addr, blkbits); - if (retval < 0) + dax_unmap_atomic(bdev, &dax); + dax.sector = to_sector(bh, inode); + dax.size = bh->b_size; + map_len = dax_map_atomic(bdev, &dax); + if (map_len < 0) { + rc = map_len; break; + } if (buffer_unwritten(bh) || buffer_new(bh)) { - dax_new_buf(addr, retval, first, pos, - end); + dax_new_buf(dax.addr, map_len, first, + pos, end); need_wmb = true; } - addr += first; - size = retval - first; + dax.addr += first; + size = map_len - first; } max = min(pos + size, end); } if (iov_iter_rw(iter) == WRITE) { - len = copy_from_iter_pmem(addr, max - pos, iter); + len = copy_from_iter_pmem(dax.addr, max - pos, iter); need_wmb = true; } else if (!hole) - len = copy_to_iter((void __force *)addr, max - pos, + len = copy_to_iter((void __force *) dax.addr, max - pos, iter); else len = iov_iter_zero(max - pos, iter); if (!len) { - retval = -EFAULT; + rc = -EFAULT; break; } pos += len; - addr += len; + if (!IS_ERR(dax.addr)) + dax.addr += len; } if (need_wmb) wmb_pmem(); + dax_unmap_atomic(bdev, &dax); - return (pos == start) ? retval : pos - start; + return (pos == start) ? rc : pos - start; } /** @@ -275,28 +305,35 @@ static int dax_load_hole(struct address_space *mapping, struct page *page, return VM_FAULT_LOCKED; } -static int copy_user_bh(struct page *to, struct buffer_head *bh, - unsigned blkbits, unsigned long vaddr) +static int copy_user_bh(struct page *to, struct inode *inode, + struct buffer_head *bh, unsigned long vaddr) { - void __pmem *vfrom; + struct blk_dax_ctl dax = { + .sector = to_sector(bh, inode), + .size = bh->b_size, + }; + struct block_device *bdev = bh->b_bdev; void *vto; - if (dax_get_addr(bh, &vfrom, blkbits) < 0) - return -EIO; + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); vto = kmap_atomic(to); - copy_user_page(vto, (void __force *)vfrom, vaddr, to); + copy_user_page(vto, (void __force *)dax.addr, vaddr, to); kunmap_atomic(vto); + dax_unmap_atomic(bdev, &dax); return 0; } static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { - struct address_space *mapping = inode->i_mapping; - sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); unsigned long vaddr = (unsigned long)vmf->virtual_address; - void __pmem *addr; - unsigned long pfn; + struct address_space *mapping = inode->i_mapping; + struct block_device *bdev = bh->b_bdev; + struct blk_dax_ctl dax = { + .sector = to_sector(bh, inode), + .size = bh->b_size, + }; pgoff_t size; int error; @@ -315,20 +352,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, goto out; } - error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); - if (error < 0) - goto out; - if (error < PAGE_SIZE) { - error = -EIO; + if (dax_map_atomic(bdev, &dax) < 0) { + error = PTR_ERR(dax.addr); goto out; } if (buffer_unwritten(bh) || buffer_new(bh)) { - clear_pmem(addr, PAGE_SIZE); + clear_pmem(dax.addr, PAGE_SIZE); wmb_pmem(); } + dax_unmap_atomic(bdev, &dax); - error = vm_insert_mixed(vma, vaddr, pfn); + error = vm_insert_mixed(vma, vaddr, dax.pfn); out: i_mmap_unlock_read(mapping); @@ -422,7 +457,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) - error = copy_user_bh(new_page, &bh, blkbits, vaddr); + error = copy_user_bh(new_page, inode, &bh, vaddr); else clear_user_highpage(new_page, vaddr); if (error) @@ -523,6 +558,24 @@ EXPORT_SYMBOL_GPL(dax_fault); */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) +static void __dax_dbg(struct buffer_head *bh, unsigned long address, + const char *reason, const char *fn) +{ + if (bh) { + char bname[BDEVNAME_SIZE]; + bdevname(bh->b_bdev, bname); + pr_debug("%s: %s addr: %lx dev %s state %lx start %lld " + "length %zd fallback: %s\n", fn, current->comm, + address, bname, bh->b_state, (u64)bh->b_blocknr, + bh->b_size, reason); + } else { + pr_debug("%s: %s addr: %lx fallback: %s\n", fn, + current->comm, address, reason); + } +} + +#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") + int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, get_block_t get_block, dax_iodone_t complete_unwritten) @@ -534,41 +587,49 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, unsigned blkbits = inode->i_blkbits; unsigned long pmd_addr = address & PMD_MASK; bool write = flags & FAULT_FLAG_WRITE; - long length; - void __pmem *kaddr; + struct block_device *bdev; pgoff_t size, pgoff; - sector_t block, sector; - unsigned long pfn; + sector_t block; int result = 0; - /* dax pmd mappings are broken wrt gup and fork */ + /* dax pmd mappings require pfn_t_devmap() */ if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) return VM_FAULT_FALLBACK; /* Fall back to PTEs if we're going to COW */ - if (write && !(vma->vm_flags & VM_SHARED)) + if (write && !(vma->vm_flags & VM_SHARED)) { + split_huge_pmd(vma, pmd, address); + dax_pmd_dbg(NULL, address, "cow write"); return VM_FAULT_FALLBACK; + } /* If the PMD would extend outside the VMA */ - if (pmd_addr < vma->vm_start) + if (pmd_addr < vma->vm_start) { + dax_pmd_dbg(NULL, address, "vma start unaligned"); return VM_FAULT_FALLBACK; - if ((pmd_addr + PMD_SIZE) > vma->vm_end) + } + if ((pmd_addr + PMD_SIZE) > vma->vm_end) { + dax_pmd_dbg(NULL, address, "vma end unaligned"); return VM_FAULT_FALLBACK; + } pgoff = linear_page_index(vma, pmd_addr); size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= size) return VM_FAULT_SIGBUS; /* If the PMD would cover blocks out of the file */ - if ((pgoff | PG_PMD_COLOUR) >= size) + if ((pgoff | PG_PMD_COLOUR) >= size) { + dax_pmd_dbg(NULL, address, + "offset + huge page size > file size"); return VM_FAULT_FALLBACK; + } memset(&bh, 0, sizeof(bh)); block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; - length = get_block(inode, block, &bh, write); - if (length) + if (get_block(inode, block, &bh, write) != 0) return VM_FAULT_SIGBUS; + bdev = bh.b_bdev; i_mmap_lock_read(mapping); /* @@ -576,8 +637,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, * just fall back to PTEs. Calling get_block 512 times in a loop * would be silly. */ - if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) + if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { + dax_pmd_dbg(&bh, address, "allocated block too small"); goto fallback; + } /* * If we allocated new storage, make sure no process has any @@ -600,57 +663,82 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, result = VM_FAULT_SIGBUS; goto out; } - if ((pgoff | PG_PMD_COLOUR) >= size) + if ((pgoff | PG_PMD_COLOUR) >= size) { + dax_pmd_dbg(&bh, address, "pgoff unaligned"); goto fallback; + } if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { spinlock_t *ptl; pmd_t entry; struct page *zero_page = get_huge_zero_page(); - if (unlikely(!zero_page)) + if (unlikely(!zero_page)) { + dax_pmd_dbg(&bh, address, "no zero page"); goto fallback; + } ptl = pmd_lock(vma->vm_mm, pmd); if (!pmd_none(*pmd)) { spin_unlock(ptl); + dax_pmd_dbg(&bh, address, "pmd already present"); goto fallback; } + dev_dbg(part_to_dev(bdev->bd_part), + "%s: %s addr: %lx pfn: <zero> sect: %llx\n", + __func__, current->comm, address, + (unsigned long long) to_sector(&bh, inode)); + entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); result = VM_FAULT_NOPAGE; spin_unlock(ptl); } else { - sector = bh.b_blocknr << (blkbits - 9); - length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, - bh.b_size); + struct blk_dax_ctl dax = { + .sector = to_sector(&bh, inode), + .size = PMD_SIZE, + }; + long length = dax_map_atomic(bdev, &dax); + if (length < 0) { result = VM_FAULT_SIGBUS; goto out; } - if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) + if (length < PMD_SIZE) { + dax_pmd_dbg(&bh, address, "dax-length too small"); + dax_unmap_atomic(bdev, &dax); goto fallback; + } + if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { + dax_pmd_dbg(&bh, address, "pfn unaligned"); + dax_unmap_atomic(bdev, &dax); + goto fallback; + } - /* - * TODO: teach vmf_insert_pfn_pmd() to support - * 'pte_special' for pmds - */ - if (pfn_valid(pfn)) + if (!pfn_t_devmap(dax.pfn)) { + dax_unmap_atomic(bdev, &dax); + dax_pmd_dbg(&bh, address, "pfn not in memmap"); goto fallback; + } if (buffer_unwritten(&bh) || buffer_new(&bh)) { - int i; - for (i = 0; i < PTRS_PER_PMD; i++) - clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE); + clear_pmem(dax.addr, PMD_SIZE); wmb_pmem(); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); result |= VM_FAULT_MAJOR; } - - result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write); + dax_unmap_atomic(bdev, &dax); + + dev_dbg(part_to_dev(bdev->bd_part), + "%s: %s addr: %lx pfn: %lx sect: %llx\n", + __func__, current->comm, address, + pfn_t_to_pfn(dax.pfn), + (unsigned long long) dax.sector); + result |= vmf_insert_pfn_pmd(vma, address, pmd, + dax.pfn, write); } out: @@ -752,12 +840,17 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, if (err < 0) return err; if (buffer_written(&bh)) { - void __pmem *addr; - err = dax_get_addr(&bh, &addr, inode->i_blkbits); - if (err < 0) - return err; - clear_pmem(addr + offset, length); + struct block_device *bdev = bh.b_bdev; + struct blk_dax_ctl dax = { + .sector = to_sector(&bh, inode), + .size = PAGE_CACHE_SIZE, + }; + + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); + clear_pmem(dax.addr + offset, length); wmb_pmem(); + dax_unmap_atomic(bdev, &dax); } return 0; diff --git a/fs/dcache.c b/fs/dcache.c index 8d38cd07b207..b4539e84e577 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1571,7 +1571,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dentry->d_iname[DNAME_INLINE_LEN-1] = 0; if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - struct external_name *p = kmalloc(size + name->len, GFP_KERNEL); + struct external_name *p = kmalloc(size + name->len, + GFP_KERNEL_ACCOUNT); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; @@ -3415,7 +3416,7 @@ static void __init dcache_init(void) * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 4f4d0474bee9..e25b6b06bacf 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -663,6 +663,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; + unsigned long flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { @@ -684,6 +685,7 @@ static struct ecryptfs_cache_info { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", .size = sizeof(struct ecryptfs_inode_info), + .flags = SLAB_ACCOUNT, .ctor = inode_info_init_once, }, { @@ -755,8 +757,8 @@ static int ecryptfs_init_kmem_caches(void) struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; - *(info->cache) = kmem_cache_create(info->name, info->size, - 0, SLAB_HWCACHE_ALIGN, info->ctor); + *(info->cache) = kmem_cache_create(info->name, info->size, 0, + SLAB_HWCACHE_ALIGN | info->flags, info->ctor); if (!*(info->cache)) { ecryptfs_free_kmem_caches(); ecryptfs_printk(KERN_WARNING, "%s: " diff --git a/fs/efs/super.c b/fs/efs/super.c index c8411a30f7da..cb68dac4f9d3 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -94,9 +94,9 @@ static void init_once(void *foo) static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", - sizeof(struct efs_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once); + sizeof(struct efs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, init_once); if (efs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/eventfd.c b/fs/eventfd.c index 8d0c0df01854..ed70cf9fdc7b 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -45,10 +45,10 @@ struct eventfd_ctx { * * This function is supposed to be called by the kernel in paths that do not * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX - * value, and we signal this as overflow condition by returining a POLLERR + * value, and we signal this as overflow condition by returning a POLLERR * to poll(2). * - * Returns the amount by which the counter was incrememnted. This will be less + * Returns the amount by which the counter was incremented. This will be less * than @n if the counter has overflowed. */ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) diff --git a/fs/exofs/super.c b/fs/exofs/super.c index b795c567b5e1..6658a50530a0 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -194,8 +194,8 @@ static int init_inodecache(void) { exofs_inode_cachep = kmem_cache_create("exofs_inode_cache", sizeof(struct exofs_i_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - exofs_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_ACCOUNT, exofs_init_once); if (exofs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 748d35afc902..2a188413a2b0 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -203,7 +203,7 @@ static int __init init_inodecache(void) ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", sizeof(struct ext2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ext2_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c9ab67da6e5a..f1b56ff01208 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -966,7 +966,7 @@ static int __init init_inodecache(void) ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f661d80474be..3842af954cd5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -237,7 +237,7 @@ static int f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); - if (wbc->for_reclaim) + if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, META, WRITE); return 0; @@ -410,13 +410,13 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) spin_unlock(&im->ino_lock); } -void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ __add_ino_entry(sbi, ino, type); } -void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* remove dirty ino entry from list */ __remove_ino_entry(sbi, ino, type); @@ -434,7 +434,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_dirty_inode(struct f2fs_sb_info *sbi) +void release_ino_entry(struct f2fs_sb_info *sbi) { struct ino_entry *e, *tmp; int i; @@ -722,47 +722,48 @@ fail_no_cp: return -EINVAL; } -static int __add_dirty_inode(struct inode *inode, struct inode_entry *new) +static void __add_dirty_inode(struct inode *inode, enum inode_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; - if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) - return -EEXIST; + if (is_inode_flag_set(fi, flag)) + return; - set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - F2FS_I(inode)->dirty_dir = new; - list_add_tail(&new->list, &sbi->dir_inode_list); - stat_inc_dirty_dir(sbi); - return 0; + set_inode_flag(fi, flag); + list_add_tail(&fi->dirty_list, &sbi->inode_list[type]); + stat_inc_dirty_inode(sbi, type); +} + +static void __remove_dirty_inode(struct inode *inode, enum inode_type type) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; + + if (get_dirty_pages(inode) || + !is_inode_flag_set(F2FS_I(inode), flag)) + return; + + list_del_init(&fi->dirty_list); + clear_inode_flag(fi, flag); + stat_dec_dirty_inode(F2FS_I_SB(inode), type); } void update_dirty_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new; - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; - if (!S_ISDIR(inode->i_mode)) { - inode_inc_dirty_pages(inode); - goto out; - } - - new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); + spin_lock(&sbi->inode_lock[type]); + __add_dirty_inode(inode, type); inode_inc_dirty_pages(inode); - spin_unlock(&sbi->dir_inode_lock); + spin_unlock(&sbi->inode_lock[type]); - if (ret) - kmem_cache_free(inode_entry_slab, new); -out: SetPagePrivate(page); f2fs_trace_pid(page); } @@ -770,70 +771,60 @@ out: void add_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new = - f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - int ret = 0; - - new->inode = inode; - INIT_LIST_HEAD(&new->list); - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); - spin_unlock(&sbi->dir_inode_lock); - - if (ret) - kmem_cache_free(inode_entry_slab, new); + spin_lock(&sbi->inode_lock[DIR_INODE]); + __add_dirty_inode(inode, DIR_INODE); + spin_unlock(&sbi->inode_lock[DIR_INODE]); } -void remove_dirty_dir_inode(struct inode *inode) +void remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *entry; - - if (!S_ISDIR(inode->i_mode)) - return; + struct f2fs_inode_info *fi = F2FS_I(inode); + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; - spin_lock(&sbi->dir_inode_lock); - if (get_dirty_pages(inode) || - !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { - spin_unlock(&sbi->dir_inode_lock); + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; - } - entry = F2FS_I(inode)->dirty_dir; - list_del(&entry->list); - F2FS_I(inode)->dirty_dir = NULL; - clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - stat_dec_dirty_dir(sbi); - spin_unlock(&sbi->dir_inode_lock); - kmem_cache_free(inode_entry_slab, entry); + spin_lock(&sbi->inode_lock[type]); + __remove_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); /* Only from the recovery routine */ - if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { - clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); + if (is_inode_flag_set(fi, FI_DELAY_IPUT)) { + clear_inode_flag(fi, FI_DELAY_IPUT); iput(inode); } } -void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) { struct list_head *head; - struct inode_entry *entry; struct inode *inode; + struct f2fs_inode_info *fi; + bool is_dir = (type == DIR_INODE); + + trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - spin_lock(&sbi->dir_inode_lock); + spin_lock(&sbi->inode_lock[type]); - head = &sbi->dir_inode_list; + head = &sbi->inode_list[type]; if (list_empty(head)) { - spin_unlock(&sbi->dir_inode_lock); - return; + spin_unlock(&sbi->inode_lock[type]); + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return 0; } - entry = list_entry(head->next, struct inode_entry, list); - inode = igrab(entry->inode); - spin_unlock(&sbi->dir_inode_lock); + fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[type]); if (inode) { filemap_fdatawrite(inode->i_mapping); iput(inode); @@ -868,11 +859,9 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - sync_dirty_dir_inodes(sbi); - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; + err = sync_dirty_inodes(sbi, DIR_INODE); + if (err) goto out; - } goto retry_flush_dents; } @@ -885,10 +874,9 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - sync_node_pages(sbi, 0, &wbc); - if (unlikely(f2fs_cp_error(sbi))) { + err = sync_node_pages(sbi, 0, &wbc); + if (err) { f2fs_unlock_all(sbi); - err = -EIO; goto out; } goto retry_flush_nodes; @@ -919,7 +907,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) finish_wait(&sbi->cp_wait, &wait); } -static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); @@ -945,7 +933,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) while (get_pages(sbi, F2FS_DIRTY_META)) { sync_meta_pages(sbi, META, LONG_MAX); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; } next_free_nid(sbi, &last_nid); @@ -1030,7 +1018,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1058,7 +1046,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); @@ -1081,22 +1069,25 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, discard_blk); - release_dirty_inode(sbi); + release_ino_entry(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); + + return 0; } /* * We guarantee that this checkpoint procedure will not fail. */ -void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + int err = 0; mutex_lock(&sbi->cp_mutex); @@ -1104,14 +1095,19 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || (cpc->reason == CP_DISCARD && !sbi->discard_blks))) goto out; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; goto out; - if (f2fs_readonly(sbi->sb)) + } + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; goto out; + } trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); - if (block_operations(sbi)) + err = block_operations(sbi); + if (err) goto out; trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); @@ -1133,7 +1129,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ - do_checkpoint(sbi, cpc); + err = do_checkpoint(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1143,10 +1139,11 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) "checkpoint: version = %llx", ckpt_ver); /* do checkpoint periodically */ - sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval); + f2fs_update_time(sbi, CP_TIME); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: mutex_unlock(&sbi->cp_mutex); - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); + return err; } void init_ino_entry_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 972eab7ac071..ac9e7c6aac74 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -225,7 +225,8 @@ void set_data_blkaddr(struct dnode_of_data *dn) /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); - set_page_dirty(node_page); + if (set_page_dirty(node_page)) + dn->node_changed = true; } int reserve_new_block(struct dnode_of_data *dn) @@ -412,7 +413,7 @@ struct page *get_new_data_page(struct inode *inode, struct page *page; struct dnode_of_data dn; int err; -repeat: + page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* @@ -441,12 +442,11 @@ repeat: } else { f2fs_put_page(page, 1); - page = get_read_data_page(inode, index, READ_SYNC, true); + /* if ipage exists, blkaddr should be NEW_ADDR */ + f2fs_bug_on(F2FS_I_SB(inode), ipage); + page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - goto repeat; - - /* wait for read completion */ - lock_page(page); + return page; } got_it: if (new_i_size && i_size_read(inode) < @@ -494,14 +494,10 @@ alloc: if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) i_size_write(dn->inode, ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)); - - /* direct IO doesn't use extent cache to maximize the performance */ - f2fs_drop_largest_extent(dn->inode, fofs); - return 0; } -static void __allocate_data_blocks(struct inode *inode, loff_t offset, +static int __allocate_data_blocks(struct inode *inode, loff_t offset, size_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -510,14 +506,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, u64 len = F2FS_BYTES_TO_BLK(count); bool allocated; u64 end_offset; + int err = 0; while (len) { - f2fs_balance_fs(sbi); f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, start, ALLOC_NODE)) + err = get_dnode_of_data(&dn, start, ALLOC_NODE); + if (err) goto out; allocated = false; @@ -526,12 +523,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, while (dn.ofs_in_node < end_offset && len) { block_t blkaddr; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; goto sync_out; + } blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { - if (__allocate_data_block(&dn)) + err = __allocate_data_block(&dn); + if (err) goto sync_out; allocated = true; } @@ -545,8 +545,10 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); } - return; + return err; sync_out: if (allocated) @@ -554,7 +556,8 @@ sync_out: f2fs_put_dnode(&dn); out: f2fs_unlock_op(sbi); - return; + f2fs_balance_fs(sbi, dn.node_changed); + return err; } /* @@ -566,7 +569,7 @@ out: * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag) { unsigned int maxblocks = map->m_len; @@ -577,6 +580,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; struct extent_info ei; bool allocated = false; + block_t blkaddr; map->m_len = 0; map->m_flags = 0; @@ -592,7 +596,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, } if (create) - f2fs_lock_op(F2FS_I_SB(inode)); + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -640,12 +644,21 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, pgofs++; get_next: + if (map->m_len >= maxblocks) + goto sync_out; + if (dn.ofs_in_node >= end_offset) { if (allocated) sync_inode_page(&dn); allocated = false; f2fs_put_dnode(&dn); + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + f2fs_lock_op(sbi); + } + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); if (err) { @@ -657,52 +670,53 @@ get_next: end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); } - if (maxblocks > map->m_len) { - block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { - if (create) { - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; - goto sync_out; - } - err = __allocate_data_block(&dn); - if (err) - goto sync_out; - allocated = true; - map->m_flags |= F2FS_MAP_NEW; - blkaddr = dn.data_blkaddr; - } else { - /* - * we only merge preallocated unwritten blocks - * for fiemap. - */ - if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) - goto sync_out; + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { + if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto sync_out; } + err = __allocate_data_block(&dn); + if (err) + goto sync_out; + allocated = true; + map->m_flags |= F2FS_MAP_NEW; + blkaddr = dn.data_blkaddr; + } else { + /* + * we only merge preallocated unwritten blocks + * for fiemap. + */ + if (flag != F2FS_GET_BLOCK_FIEMAP || + blkaddr != NEW_ADDR) + goto sync_out; } + } - /* Give more consecutive addresses for the readahead */ - if ((map->m_pblk != NEW_ADDR && - blkaddr == (map->m_pblk + ofs)) || - (map->m_pblk == NEW_ADDR && - blkaddr == NEW_ADDR)) { - ofs++; - dn.ofs_in_node++; - pgofs++; - map->m_len++; - goto get_next; - } + /* Give more consecutive addresses for the readahead */ + if ((map->m_pblk != NEW_ADDR && + blkaddr == (map->m_pblk + ofs)) || + (map->m_pblk == NEW_ADDR && + blkaddr == NEW_ADDR)) { + ofs++; + dn.ofs_in_node++; + pgofs++; + map->m_len++; + goto get_next; } + sync_out: if (allocated) sync_inode_page(&dn); put_out: f2fs_put_dnode(&dn); unlock_out: - if (create) - f2fs_unlock_op(F2FS_I_SB(inode)); + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + } out: trace_f2fs_map_blocks(inode, map, err); return err; @@ -742,6 +756,10 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) + return -EFBIG; + return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_BMAP); } @@ -761,10 +779,9 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { struct buffer_head map_bh; sector_t start_blk, last_blk; - loff_t isize = i_size_read(inode); + loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; - bool past_eof = false, whole_file = false; int ret = 0; ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); @@ -779,16 +796,19 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, mutex_lock(&inode->i_mutex); - if (len >= isize) { - whole_file = true; - len = isize; - } + isize = i_size_read(inode); + if (start >= isize) + goto out; + + if (start + len > isize) + len = isize - start; if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); + next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; @@ -800,59 +820,37 @@ next: /* HOLE */ if (!buffer_mapped(&map_bh)) { - start_blk++; - - if (!past_eof && blk_to_logical(inode, start_blk) >= isize) - past_eof = 1; - - if (past_eof && size) { - flags |= FIEMAP_EXTENT_LAST; - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - } else if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - size = 0; - } + /* Go through holes util pass the EOF */ + if (blk_to_logical(inode, start_blk++) < isize) + goto prep_next; + /* Found a hole beyond isize means no more extents. + * Note that the premise is that filesystems don't + * punch holes beyond isize and keep size unchanged. + */ + flags |= FIEMAP_EXTENT_LAST; + } - /* if we have holes up to/past EOF then we're done */ - if (start_blk > last_blk || past_eof || ret) - goto out; - } else { - if (start_blk > last_blk && !whole_file) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - goto out; - } + if (size) { + if (f2fs_encrypted_inode(inode)) + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; - /* - * if size != 0 then we know we already have an extent - * to add, so add it. - */ - if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - if (ret) - goto out; - } + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + } - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; - flags = 0; - if (buffer_unwritten(&map_bh)) - flags = FIEMAP_EXTENT_UNWRITTEN; + if (start_blk > last_blk || ret) + goto out; - start_blk += logical_to_blk(inode, size); + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = 0; + if (buffer_unwritten(&map_bh)) + flags = FIEMAP_EXTENT_UNWRITTEN; - /* - * If we are past the EOF, then we need to make sure as - * soon as we find a hole that the last extent we found - * is marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && logical + size >= isize) - past_eof = true; - } + start_blk += logical_to_blk(inode, size); + +prep_next: cond_resched(); if (fatal_signal_pending(current)) ret = -EINTR; @@ -1083,6 +1081,7 @@ int do_write_data_page(struct f2fs_io_info *fio) */ if (unlikely(fio->blk_addr != NEW_ADDR && !is_cold_data(page) && + !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { rewrite_data_page(fio); set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); @@ -1179,10 +1178,11 @@ out: if (err) ClearPageUptodate(page); unlock_page(page); - if (need_balance_fs) - f2fs_balance_fs(sbi); - if (wbc->for_reclaim) + f2fs_balance_fs(sbi, need_balance_fs); + if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, DATA, WRITE); + remove_dirty_inode(inode); + } return 0; redirty_out: @@ -1354,6 +1354,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; + /* skip writing during file defragment */ + if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG)) + goto skip_write; + /* during POR, we don't need to trigger writepage at all. */ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; @@ -1369,7 +1373,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (locked) mutex_unlock(&sbi->writepages); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; @@ -1382,13 +1386,85 @@ skip_write: static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; + loff_t i_size = i_size_read(inode); - if (to > inode->i_size) { - truncate_pagecache(inode, inode->i_size); - truncate_blocks(inode, inode->i_size, true); + if (to > i_size) { + truncate_pagecache(inode, i_size); + truncate_blocks(inode, i_size, true); } } +static int prepare_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + pgoff_t index = page->index; + struct dnode_of_data dn; + struct page *ipage; + bool locked = false; + struct extent_info ei; + int err = 0; + + if (f2fs_has_inline_data(inode) || + (pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + f2fs_lock_op(sbi); + locked = true; + } +restart: + /* check inline_data */ + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA) { + read_inline_data(page, ipage); + set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + sync_inode_page(&dn); + } else { + err = f2fs_convert_inline_page(&dn, page); + if (err) + goto out; + if (dn.data_blkaddr == NULL_ADDR) + err = f2fs_get_block(&dn, index); + } + } else if (locked) { + err = f2fs_get_block(&dn, index); + } else { + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + bool restart = false; + + /* hole case */ + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err || (!err && dn.data_blkaddr == NULL_ADDR)) + restart = true; + if (restart) { + f2fs_put_dnode(&dn); + f2fs_lock_op(sbi); + locked = true; + goto restart; + } + } + } + + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; +out: + f2fs_put_dnode(&dn); +unlock_out: + if (locked) + f2fs_unlock_op(sbi); + return err; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1396,15 +1472,13 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; - struct page *ipage; pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; - struct dnode_of_data dn; + bool need_balance = false; + block_t blkaddr = NULL_ADDR; int err = 0; trace_f2fs_write_begin(inode, pos, len, flags); - f2fs_balance_fs(sbi); - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -1424,41 +1498,27 @@ repeat: *pagep = page; - f2fs_lock_op(sbi); - - /* check inline_data */ - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto unlock_fail; - } - - set_new_dnode(&dn, inode, ipage, ipage, 0); + err = prepare_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + if (err) + goto fail; - if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { - read_inline_data(page, ipage); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - sync_inode_page(&dn); - goto put_next; + if (need_balance && has_not_enough_free_secs(sbi, 0)) { + unlock_page(page); + f2fs_balance_fs(sbi, true); + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + f2fs_put_page(page, 1); + goto repeat; } - err = f2fs_convert_inline_page(&dn, page); - if (err) - goto put_fail; } - err = f2fs_get_block(&dn, index); - if (err) - goto put_fail; -put_next: - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - f2fs_wait_on_page_writeback(page, DATA); /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); if (len == PAGE_CACHE_SIZE) goto out_update; @@ -1474,14 +1534,14 @@ put_next: goto out_update; } - if (dn.data_blkaddr == NEW_ADDR) { + if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, .rw = READ_SYNC, - .blk_addr = dn.data_blkaddr, + .blk_addr = blkaddr, .page = page, .encrypted_page = NULL, }; @@ -1512,10 +1572,6 @@ out_clear: clear_cold_data(page); return 0; -put_fail: - f2fs_put_dnode(&dn); -unlock_fail: - f2fs_unlock_op(sbi); fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); @@ -1540,6 +1596,7 @@ static int f2fs_write_end(struct file *file, } f2fs_put_page(page, 1); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } @@ -1567,11 +1624,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int err; /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + err = f2fs_convert_inline_inode(inode); + if (err) + return err; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; @@ -1583,11 +1638,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == WRITE) { - __allocate_data_blocks(inode, offset, count); - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { - err = -EIO; + err = __allocate_data_blocks(inode, offset, count); + if (err) goto out; - } } err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ad1b18a7705b..4fb6ef88a34f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -38,12 +38,15 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = sbi->total_ext_tree; + si->ext_tree = atomic_read(&sbi->total_ext_tree); + si->zombie_tree = atomic_read(&sbi->total_zombie_tree); si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); - si->ndirty_dirs = sbi->n_dirty_dirs; si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); + si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; + si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; @@ -105,7 +108,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); + blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); @@ -189,10 +192,10 @@ get_cache: si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry); for (i = 0; i <= UPDATE_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree); + si->cache_mem += atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree); si->cache_mem += atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node); @@ -267,7 +270,8 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); - seq_printf(s, "CP calls: %d\n", si->cp_count); + seq_printf(s, "CP calls: %d (BG: %d)\n", + si->cp_count, si->bg_cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", @@ -288,8 +292,8 @@ static int stat_show(struct seq_file *s, void *v) !si->total_ext ? 0 : div64_u64(si->hit_total * 100, si->total_ext), si->hit_total, si->total_ext); - seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", - si->ext_tree, si->ext_node); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - inmem: %4d, wb: %4d\n", si->inmem_pages, si->wb_pages); @@ -297,6 +301,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d\n", si->ndirty_dent, si->ndirty_dirs); + seq_printf(s, " - datas: %4d in files:%4d\n", + si->ndirty_data, si->ndirty_files); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", @@ -404,20 +410,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kfree(si); } -void __init f2fs_create_root_stats(void) +int __init f2fs_create_root_stats(void) { struct dentry *file; f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); if (!f2fs_debugfs_root) - return; + return -ENOMEM; file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); if (!file) { debugfs_remove(f2fs_debugfs_root); f2fs_debugfs_root = NULL; + return -ENOMEM; } + + return 0; } void f2fs_destroy_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7c1678ba8f92..faa7495e2d7e 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -172,8 +172,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, namehash = f2fs_dentry_hash(&name); - f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); - nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -238,6 +236,14 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, goto out; max_depth = F2FS_I(dir)->i_current_depth; + if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { + f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING, + "Corrupted max_depth of %lu: %u", + dir->i_ino, max_depth); + max_depth = MAX_DIR_HASH_DEPTH; + F2FS_I(dir)->i_current_depth = max_depth; + mark_inode_dirty(dir); + } for (level = 0; level < max_depth; level++) { de = find_in_level(dir, level, &fname, res_page); @@ -444,7 +450,7 @@ error: /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ truncate_inode_pages(&inode->i_data, 0); truncate_blocks(inode, 0, false); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); remove_inode_page(inode); return ERR_PTR(err); } @@ -630,6 +636,7 @@ fail: f2fs_put_page(dentry_page, 1); out: f2fs_fname_free_filename(&fname); + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); return err; } @@ -651,6 +658,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); fail: up_write(&F2FS_I(inode)->i_sem); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } @@ -695,6 +703,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); int i; + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); @@ -855,25 +865,27 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n, false); - if (IS_ERR(dentry_page)) - continue; + if (IS_ERR(dentry_page)) { + err = PTR_ERR(dentry_page); + if (err == -ENOENT) + continue; + else + goto out; + } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) - goto stop; + if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + break; + } ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - dentry_page = NULL; - } -stop: - if (dentry_page && !IS_ERR(dentry_page)) { - kunmap(dentry_page); - f2fs_put_page(dentry_page, 1); } out: f2fs_fname_crypto_free_buffer(&fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 7ddba812e11b..ccd5c636d3fe 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -36,7 +36,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, rb_link_node(&en->rb_node, parent, p); rb_insert_color(&en->rb_node, &et->root); - et->count++; + atomic_inc(&et->node_cnt); atomic_inc(&sbi->total_ext_node); return en; } @@ -45,7 +45,7 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { rb_erase(&en->rb_node, &et->root); - et->count--; + atomic_dec(&et->node_cnt); atomic_dec(&sbi->total_ext_node); if (et->cached_en == en) @@ -68,11 +68,13 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) et->root = RB_ROOT; et->cached_en = NULL; rwlock_init(&et->lock); - atomic_set(&et->refcount, 0); - et->count = 0; - sbi->total_ext_tree++; + INIT_LIST_HEAD(&et->list); + atomic_set(&et->node_cnt, 0); + atomic_inc(&sbi->total_ext_tree); + } else { + atomic_dec(&sbi->total_zombie_tree); + list_del_init(&et->list); } - atomic_inc(&et->refcount); up_write(&sbi->extent_tree_lock); /* never died until evict_inode */ @@ -131,7 +133,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, { struct rb_node *node, *next; struct extent_node *en; - unsigned int count = et->count; + unsigned int count = atomic_read(&et->node_cnt); node = rb_first(&et->root); while (node) { @@ -152,7 +154,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, node = next; } - return count - et->count; + return count - atomic_read(&et->node_cnt); } static void __drop_largest_extent(struct inode *inode, @@ -164,34 +166,33 @@ static void __drop_largest_extent(struct inode *inode, largest->len = 0; } -void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) -{ - if (!f2fs_may_extent_tree(inode)) - return; - - __drop_largest_extent(inode, fofs, 1); -} - -void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +/* return true, if inode page is changed */ +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) - return; + if (!f2fs_may_extent_tree(inode)) { + /* drop largest extent */ + if (i_ext && i_ext->len) { + i_ext->len = 0; + return true; + } + return false; + } et = __grab_extent_tree(inode); - if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) - return; + if (!i_ext || !i_ext->len) + return false; set_extent_info(&ei, le32_to_cpu(i_ext->fofs), le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); write_lock(&et->lock); - if (et->count) + if (atomic_read(&et->node_cnt)) goto out; en = __init_extent_tree(sbi, et, &ei); @@ -202,6 +203,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) } out: write_unlock(&et->lock); + return false; } static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -549,45 +551,44 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; + struct extent_tree *et, *next; struct extent_node *en, *tmp; unsigned long ino = F2FS_ROOT_INO(sbi); - struct radix_tree_root *root = &sbi->extent_tree_root; unsigned int found; unsigned int node_cnt = 0, tree_cnt = 0; int remained; + bool do_free = false; if (!test_opt(sbi, EXTENT_CACHE)) return 0; + if (!atomic_read(&sbi->total_zombie_tree)) + goto free_node; + if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - if (!atomic_read(&et->refcount)) { - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, true); - write_unlock(&et->lock); + list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + if (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + } - radix_tree_delete(root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; + list_del_init(&et->list); + radix_tree_delete(&sbi->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&sbi->total_ext_tree); + atomic_dec(&sbi->total_zombie_tree); + tree_cnt++; - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } - } + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; } up_write(&sbi->extent_tree_lock); +free_node: /* 2. remove LRU extent entries */ if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; @@ -599,15 +600,19 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) if (!remained--) break; list_del_init(&en->list); + do_free = true; } spin_unlock(&sbi->extent_lock); + if (do_free == false) + goto unlock_out; + /* * reset ino for searching victims from beginning of global extent tree. */ ino = F2FS_ROOT_INO(sbi); - while ((found = radix_tree_gang_lookup(root, + while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root, (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { unsigned i; @@ -615,9 +620,13 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) for (i = 0; i < found; i++) { struct extent_tree *et = treevec[i]; - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, false); - write_unlock(&et->lock); + if (!atomic_read(&et->node_cnt)) + continue; + + if (write_trylock(&et->lock)) { + node_cnt += __free_extent_tree(sbi, et, false); + write_unlock(&et->lock); + } if (node_cnt + tree_cnt >= nr_shrink) goto unlock_out; @@ -637,7 +646,7 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) struct extent_tree *et = F2FS_I(inode)->extent_tree; unsigned int node_cnt = 0; - if (!et) + if (!et || !atomic_read(&et->node_cnt)) return 0; write_lock(&et->lock); @@ -656,8 +665,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (!et) return; - if (inode->i_nlink && !is_bad_inode(inode) && et->count) { - atomic_dec(&et->refcount); + if (inode->i_nlink && !is_bad_inode(inode) && + atomic_read(&et->node_cnt)) { + down_write(&sbi->extent_tree_lock); + list_add_tail(&et->list, &sbi->zombie_list); + atomic_inc(&sbi->total_zombie_tree); + up_write(&sbi->extent_tree_lock); return; } @@ -666,11 +679,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) /* delete extent tree entry in radix tree */ down_write(&sbi->extent_tree_lock); - atomic_dec(&et->refcount); - f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; + atomic_dec(&sbi->total_ext_tree); up_write(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -722,7 +734,9 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi) init_rwsem(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); - sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_tree, 0); + INIT_LIST_HEAD(&sbi->zombie_list); + atomic_set(&sbi->total_zombie_tree, 0); atomic_set(&sbi->total_ext_node, 0); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ec6067c33a3f..ff79054c6cf6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -21,6 +21,7 @@ #include <linux/sched.h> #include <linux/vmalloc.h> #include <linux/bio.h> +#include <linux/blkdev.h> #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -54,6 +55,7 @@ #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_FORCE_FG_GC 0x00004000 +#define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -125,6 +127,7 @@ enum { #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define DEF_CP_INTERVAL 60 /* 60 secs */ +#define DEF_IDLE_INTERVAL 120 /* 2 mins */ struct cp_control { int reason; @@ -158,13 +161,7 @@ struct ino_entry { nid_t ino; /* inode number */ }; -/* - * for the list of directory inodes or gc inodes. - * NOTE: there are two slab users for this structure, if we add/modify/delete - * fields in structure for one of slab users, it may affect fields or size of - * other one, in this condition, it's better to split both of slab and related - * data structure. - */ +/* for the list of inodes to be GCed */ struct inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ @@ -234,6 +231,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) +#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) #define F2FS_IOC_SET_ENCRYPTION_POLICY \ _IOR('f', 19, struct f2fs_encryption_policy) @@ -256,10 +254,16 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, /* * ioctl commands in 32 bit emulation */ -#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_defragment { + u64 start; + u64 len; +}; + /* * For INODE and NODE manager */ @@ -357,9 +361,9 @@ struct extent_tree { struct rb_root root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct extent_info largest; /* largested extent info */ + struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ - atomic_t refcount; /* reference count of rb-tree */ - unsigned int count; /* # of extent node in rb-tree*/ + atomic_t node_cnt; /* # of extent node in rb-tree*/ }; /* @@ -434,8 +438,8 @@ struct f2fs_inode_info { unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ - struct inode_entry *dirty_dir; /* the pointer of dirty dir */ + struct list_head dirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ @@ -544,6 +548,7 @@ struct dnode_of_data { nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ bool inode_page_locked; /* inode page is locked or not */ + bool node_changed; /* is node block changed */ block_t data_blkaddr; /* block address of the node block */ }; @@ -647,6 +652,7 @@ struct f2fs_sm_info { enum count_type { F2FS_WRITEBACK, F2FS_DIRTY_DENTS, + F2FS_DIRTY_DATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, @@ -695,6 +701,12 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +enum inode_type { + DIR_INODE, /* for dirty dir inode */ + FILE_INODE, /* for dirty regular/symlink inode */ + NR_INODE_TYPE, +}; + /* for inner inode cache management */ struct inode_management { struct radix_tree_root ino_root; /* ino entry array */ @@ -711,11 +723,17 @@ enum { SBI_POR_DOING, /* recovery is doing or not */ }; +enum { + CP_TIME, + REQ_TIME, + MAX_TIME, +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ - struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ + int valid_super_block; /* valid super block no */ int s_flag; /* flags for sbi */ /* for node-related operations */ @@ -737,23 +755,26 @@ struct f2fs_sb_info { struct rw_semaphore node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; - long cp_expires, cp_interval; /* next expected periodic cp */ + unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ + long interval_time[MAX_TIME]; /* to store thresholds */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ - /* for directory inode management */ - struct list_head dir_inode_list; /* dir inode list */ - spinlock_t dir_inode_lock; /* for dir inode list lock */ + /* for inode management */ + struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ + spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ - int total_ext_tree; /* extent tree count */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ atomic_t total_ext_node; /* extent info count */ /* basic filesystem units */ @@ -771,6 +792,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ unsigned int total_valid_inode_count; /* valid inode count */ + loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ @@ -809,7 +831,7 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ int bg_gc; /* background gc calls */ - unsigned int n_dirty_dirs; /* # of dir inodes */ + unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ @@ -824,6 +846,31 @@ struct f2fs_sb_info { unsigned int shrinker_run_no; }; +static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) +{ + sbi->last_time[type] = jiffies; +} + +static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) +{ + struct timespec ts = {sbi->interval_time[type], 0}; + unsigned long interval = timespec_to_jiffies(&ts); + + return time_after(jiffies, sbi->last_time[type] + interval); +} + +static inline bool is_idle(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct request_list *rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) + return 0; + + return f2fs_time_over(sbi, REQ_TIME); +} + /* * Inline functions */ @@ -1059,8 +1106,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { atomic_inc(&F2FS_I(inode)->dirty_pages); - if (S_ISDIR(inode->i_mode)) - inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1075,9 +1122,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode) return; atomic_dec(&F2FS_I(inode)->dirty_pages); - - if (S_ISDIR(inode->i_mode)) - dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) @@ -1092,8 +1138,7 @@ static inline int get_dirty_pages(struct inode *inode) static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { - unsigned int pages_per_sec = sbi->segs_per_sec * - (1 << sbi->log_blocks_per_seg); + unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; return ((get_pages(sbi, block_type) + pages_per_sec - 1) >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; } @@ -1416,6 +1461,8 @@ enum { FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_DO_DEFRAG, /* indicate defragment is running */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -1659,8 +1706,8 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); -void update_inode(struct inode *, struct page *); -void update_inode_page(struct inode *); +int update_inode(struct inode *, struct page *); +int update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); void handle_failed_inode(struct inode *); @@ -1765,7 +1812,7 @@ void destroy_node_manager_caches(void); */ void register_inmem_page(struct inode *, struct page *); int commit_inmem_pages(struct inode *, bool); -void f2fs_balance_fs(struct f2fs_sb_info *); +void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); @@ -1811,9 +1858,9 @@ bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void release_dirty_inode(struct f2fs_sb_info *); +void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void release_ino_entry(struct f2fs_sb_info *); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); @@ -1823,9 +1870,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); -void remove_dirty_dir_inode(struct inode *); -void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); +void remove_dirty_inode(struct inode *); +int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); +int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); @@ -1845,6 +1892,7 @@ struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t, bool); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); @@ -1875,8 +1923,9 @@ struct f2fs_stat_info { int main_area_segs, main_area_sections, main_area_zones; unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; - int ext_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; + int ext_tree, zombie_tree, ext_node; + int ndirty_node, ndirty_meta; + int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; int bg_gc, inmem_pages, wb_pages; @@ -1886,7 +1935,7 @@ struct f2fs_stat_info { int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count, cp_count; + int prefree_count, call_count, cp_count, bg_cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; @@ -1907,10 +1956,11 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } #define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) -#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) -#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) +#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) +#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) #define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) #define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) @@ -1985,14 +2035,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void __init f2fs_create_root_stats(void); +int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_cp_count(si) +#define stat_inc_bg_cp_count(si) #define stat_inc_call_count(si) #define stat_inc_bggc_count(si) -#define stat_inc_dirty_dir(sbi) -#define stat_dec_dirty_dir(sbi) +#define stat_inc_dirty_inode(sbi, type) +#define stat_dec_dirty_inode(sbi, type) #define stat_inc_total_hit(sb) #define stat_inc_rbtree_node_hit(sb) #define stat_inc_largest_node_hit(sbi) @@ -2013,7 +2064,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void __init f2fs_create_root_stats(void) { } +static inline int __init f2fs_create_root_stats(void) { return 0; } static inline void f2fs_destroy_root_stats(void) { } #endif @@ -2067,8 +2118,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *); * extent_cache.c */ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -void f2fs_drop_largest_extent(struct inode *, pgoff_t); -void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); unsigned int f2fs_destroy_extent_node(struct inode *); void f2fs_destroy_extent_tree(struct inode *); bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a197215ad52b..18ddb1e5182a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -40,8 +40,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct dnode_of_data dn; int err; - f2fs_balance_fs(sbi); - sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -57,6 +55,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + file_update_time(vma->vm_file); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || @@ -96,6 +96,7 @@ mapped: clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); + f2fs_update_time(sbi, REQ_TIME); return block_page_mkwrite_return(err); } @@ -201,7 +202,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_f2fs_sync_file_enter(inode); /* if fdatasync is triggered, let's do in-place-update */ - if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(fi, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); clear_inode_flag(fi, FI_NEED_IPU); @@ -233,9 +234,6 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } go_write: - /* guarantee free sections for fsync */ - f2fs_balance_fs(sbi); - /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. @@ -261,8 +259,10 @@ sync_nodes: sync_node_pages(sbi, ino, &wbc); /* if cp_error was enabled, we should avoid infinite loop */ - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto out; + } if (need_inode_block_update(sbi, ino)) { mark_inode_dirty_sync(inode); @@ -275,12 +275,13 @@ sync_nodes: goto out; /* once recovery info is written, don't need to tack this */ - remove_dirty_inode(sbi, ino, APPEND_INO); + remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(fi, FI_APPEND_WRITE); flush_out: - remove_dirty_inode(sbi, ino, UPDATE_INO); + remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(fi, FI_UPDATE_WRITE); ret = f2fs_issue_flush(sbi); + f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); f2fs_trace_ios(NULL, 1); @@ -418,19 +419,18 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); + int err; if (f2fs_encrypted_inode(inode)) { - int err = f2fs_get_encryption_info(inode); + err = f2fs_get_encryption_info(inode); if (err) return 0; } /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - int err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + err = f2fs_convert_inline_inode(inode); + if (err) + return err; file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; @@ -483,11 +483,11 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) F2FS_I(dn->inode)) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); - set_page_dirty(dn->node_page); sync_inode_page(dn); } dn->ofs_in_node = ofs; + f2fs_update_time(sbi, REQ_TIME); trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, dn->ofs_in_node, nr_free); return nr_free; @@ -604,7 +604,7 @@ int f2fs_truncate(struct inode *inode, bool lock) trace_f2fs_truncate(inode); /* we should check inline_data size */ - if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { + if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -679,13 +679,20 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) err = f2fs_truncate(inode, true); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode)); + f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is * larger than i_size. */ truncate_setsize(inode, attr->ia_size); + + /* should convert inline inode here */ + if (!f2fs_may_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } inode->i_mtime = inode->i_ctime = CURRENT_TIME; } } @@ -727,7 +734,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (!len) return 0; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); page = get_new_data_page(inode, NULL, index, false); @@ -778,13 +785,11 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; - int ret = 0; + int ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -815,7 +820,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT; blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT; @@ -918,7 +923,7 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) int ret = 0; for (; end < nrpages; start++, end++) { - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); ret = __exchange_data_block(inode, end, start, true); f2fs_unlock_op(sbi); @@ -941,13 +946,9 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(F2FS_I_SB(inode)); - - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; pg_start = offset >> PAGE_CACHE_SHIFT; pg_end = (offset + len) >> PAGE_CACHE_SHIFT; @@ -991,13 +992,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - f2fs_balance_fs(sbi); - - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) @@ -1104,13 +1101,11 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(sbi); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + f2fs_balance_fs(sbi, true); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1154,17 +1149,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t off_start, off_end; int ret = 0; - f2fs_balance_fs(sbi); - ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -1246,6 +1239,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } out: @@ -1353,8 +1347,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - f2fs_balance_fs(F2FS_I_SB(inode)); - if (f2fs_is_atomic_file(inode)) return 0; @@ -1363,6 +1355,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) return ret; set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return 0; } @@ -1384,8 +1378,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) { clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); ret = commit_inmem_pages(inode, false); - if (ret) + if (ret) { + set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); goto err_out; + } } ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); @@ -1410,6 +1406,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) return ret; set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return 0; } @@ -1441,13 +1438,17 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (ret) return ret; - f2fs_balance_fs(F2FS_I_SB(inode)); - - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - commit_inmem_pages(inode, true); + if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + commit_inmem_pages(inode, true); + } + if (f2fs_is_volatile_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + } mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } @@ -1487,6 +1488,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) default: return -EINVAL; } + f2fs_update_time(sbi, REQ_TIME); return 0; } @@ -1517,6 +1519,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return 0; } @@ -1540,6 +1543,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) sizeof(policy))) return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return f2fs_process_policy(&policy, inode); #else return -EOPNOTSUPP; @@ -1586,13 +1590,13 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) generate_random_uuid(sbi->raw_super->encrypt_pw_salt); err = f2fs_commit_super(sbi, false); - - mnt_drop_write_file(filp); if (err) { /* undo new data */ memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + mnt_drop_write_file(filp); return err; } + mnt_drop_write_file(filp); got_it: if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, 16)) @@ -1629,7 +1633,6 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct cp_control cpc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1637,13 +1640,196 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; - cpc.reason = __get_cp_reason(sbi); + return f2fs_sync_fs(sbi->sb, 1); +} - mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); +static int f2fs_defragment_range(struct f2fs_sb_info *sbi, + struct file *filp, + struct f2fs_defragment *range) +{ + struct inode *inode = file_inode(filp); + struct f2fs_map_blocks map; + struct extent_info ei; + pgoff_t pg_start, pg_end; + unsigned int blk_per_seg = sbi->blocks_per_seg; + unsigned int total = 0, sec_num; + unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; + block_t blk_end = 0; + bool fragmented = false; + int err; - return 0; + /* if in-place-update policy is enabled, don't waste time here */ + if (need_inplace_update(inode)) + return -EINVAL; + + pg_start = range->start >> PAGE_CACHE_SHIFT; + pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT; + + f2fs_balance_fs(sbi, true); + + mutex_lock(&inode->i_mutex); + + /* writeback all dirty pages in the range */ + err = filemap_write_and_wait_range(inode->i_mapping, range->start, + range->start + range->len - 1); + if (err) + goto out; + + /* + * lookup mapping info in extent cache, skip defragmenting if physical + * block addresses are continuous. + */ + if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (ei.fofs + ei.len >= pg_end) + goto out; + } + + map.m_lblk = pg_start; + + /* + * lookup mapping info in dnode page cache, skip defragmenting if all + * physical block addresses are continuous even if there are hole(s) + * in logical blocks. + */ + while (map.m_lblk < pg_end) { + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + if (blk_end && blk_end != map.m_pblk) { + fragmented = true; + break; + } + blk_end = map.m_pblk + map.m_len; + + map.m_lblk += map.m_len; + } + + if (!fragmented) + goto out; + + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + + sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + + /* + * make sure there are enough free section for LFS allocation, this can + * avoid defragment running in SSR mode when free section are allocated + * intensively + */ + if (has_not_enough_free_secs(sbi, sec_num)) { + err = -EAGAIN; + goto out; + } + + while (map.m_lblk < pg_end) { + pgoff_t idx; + int cnt = 0; + +do_map: + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto clear_out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + + idx = map.m_lblk; + while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { + struct page *page; + + page = get_lock_data_page(inode, idx, true); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto clear_out; + } + + set_page_dirty(page); + f2fs_put_page(page, 1); + + idx++; + cnt++; + total++; + } + + map.m_lblk = idx; + + if (idx < pg_end && cnt < blk_per_seg) + goto do_map; + + clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + + err = filemap_fdatawrite(inode->i_mapping); + if (err) + goto out; + } +clear_out: + clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); +out: + mutex_unlock(&inode->i_mutex); + if (!err) + range->len = (u64)total << PAGE_CACHE_SHIFT; + return err; +} + +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_defragment range; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; + + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; + goto out; + } + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) { + err = -EFAULT; + goto out; + } + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || + range.len & (F2FS_BLKSIZE - 1)) { + err = -EINVAL; + goto out; + } + + err = f2fs_defragment_range(sbi, filp, &range); + f2fs_update_time(sbi, REQ_TIME); + if (err < 0) + goto out; + + if (copy_to_user((struct f2fs_defragment __user *)arg, &range, + sizeof(range))) + err = -EFAULT; +out: + mnt_drop_write_file(filp); + return err; } long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) @@ -1679,6 +1865,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_gc(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); + case F2FS_IOC_DEFRAGMENT: + return f2fs_ioc_defragment(filp, arg); default: return -ENOTTY; } @@ -1706,6 +1894,22 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC32_SETFLAGS: cmd = F2FS_IOC_SETFLAGS; break; + case F2FS_IOC32_GETVERSION: + cmd = F2FS_IOC_GETVERSION; + break; + case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_SHUTDOWN: + case F2FS_IOC_SET_ENCRYPTION_POLICY: + case F2FS_IOC_GET_ENCRYPTION_PWSALT: + case F2FS_IOC_GET_ENCRYPTION_POLICY: + case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_WRITE_CHECKPOINT: + case F2FS_IOC_DEFRAGMENT: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fedbf67a0842..f610c2a9bdde 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -16,7 +16,6 @@ #include <linux/kthread.h> #include <linux/delay.h> #include <linux/freezer.h> -#include <linux/blkdev.h> #include "f2fs.h" #include "node.h" @@ -173,9 +172,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, { /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) - return 1 << sbi->log_blocks_per_seg; + return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; + return sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -832,8 +831,10 @@ gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto stop; + } if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { gc_type = FG_GC; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b4a65be9f7d3..a993967dcdb9 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -100,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) return true; return false; } - -static inline int is_idle(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct request_list *rl = &q->root_rl; - return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); -} diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index bda7126466c0..c3f0b7d4cfca 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -16,9 +16,6 @@ bool f2fs_may_inline_data(struct inode *inode) { - if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) - return false; - if (f2fs_is_atomic_file(inode)) return false; @@ -177,6 +174,9 @@ int f2fs_convert_inline_inode(struct inode *inode) struct page *ipage, *page; int err = 0; + if (!f2fs_has_inline_data(inode)) + return 0; + page = grab_cache_page(inode->i_mapping, 0); if (!page) return -ENOMEM; @@ -199,6 +199,9 @@ out: f2fs_unlock_op(sbi); f2fs_put_page(page, 1); + + f2fs_balance_fs(sbi, dn.node_changed); + return err; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 5528801a5baf..2adeff26be11 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -138,7 +138,8 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, &ri->i_ext); + if (f2fs_init_extent_tree(inode, &ri->i_ext)) + set_page_dirty(node_page); get_inline_info(fi, ri); @@ -222,7 +223,7 @@ bad_inode: return ERR_PTR(ret); } -void update_inode(struct inode *inode, struct page *node_page) +int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; @@ -260,15 +261,16 @@ void update_inode(struct inode *inode, struct page *node_page) __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); - set_page_dirty(node_page); - clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + + return set_page_dirty(node_page); } -void update_inode_page(struct inode *inode) +int update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; + int ret = 0; retry: node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { @@ -279,10 +281,11 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi); } - return; + return 0; } - update_inode(inode, node_page); + ret = update_inode(inode, node_page); f2fs_put_page(node_page, 1); + return ret; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -300,9 +303,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - update_inode_page(inode); - - f2fs_balance_fs(sbi); + if (update_inode_page(inode)) + f2fs_balance_fs(sbi, true); return 0; } @@ -328,7 +330,7 @@ void f2fs_evict_inode(struct inode *inode) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); f2fs_destroy_extent_tree(inode); @@ -358,9 +360,9 @@ no_delete: if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (is_inode_flag_set(fi, FI_APPEND_WRITE)) - add_dirty_inode(sbi, inode->i_ino, APPEND_INO); + add_ino_entry(sbi, inode->i_ino, APPEND_INO); if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) - add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); if (is_inode_flag_set(fi, FI_FREE_NID)) { if (err && err != -ENOENT) alloc_nid_done(sbi, inode->i_ino); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e7587fce1b80..6f944e5eb76e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -60,7 +60,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - if (f2fs_may_inline_data(inode)) + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); @@ -128,8 +128,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -142,6 +140,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -172,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !f2fs_is_child_context_consistent_with_parent(dir, inode)) return -EPERM; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); inode->i_ctime = CURRENT_TIME; ihold(inode); @@ -214,6 +214,15 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) struct page *page; int err = 0; + if (f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "skip recovering inline_dots inode (ino:%lu, pino:%u) " + "in readonly mountpoint", dir->i_ino, pino); + return 0; + } + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); de = f2fs_find_entry(dir, &dot, &page); @@ -288,12 +297,13 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) int err = -ENOENT; trace_f2fs_unlink_enter(dir, dentry); - f2fs_balance_fs(sbi); de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) goto fail; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) { @@ -344,8 +354,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -357,6 +365,8 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -437,8 +447,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -448,6 +456,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + f2fs_balance_fs(sbi, true); + set_inode_flag(F2FS_I(inode), FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); @@ -485,8 +495,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -494,6 +502,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -520,9 +530,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; - if (!whiteout) - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -536,6 +543,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -608,8 +617,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) goto out; @@ -639,6 +646,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) goto out_whiteout; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); @@ -670,6 +679,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, update_inode_page(old_inode); update_inode_page(new_inode); } else { + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(new_dentry, old_inode); @@ -767,8 +778,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode))) return -EPERM; - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) goto out; @@ -811,6 +820,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_new_dir; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); @@ -933,7 +944,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, { struct page *cpage = NULL; char *caddr, *paddr = NULL; - struct f2fs_str cstr; + struct f2fs_str cstr = FSTR_INIT(NULL, 0); struct f2fs_str pstr = FSTR_INIT(NULL, 0); struct f2fs_encrypted_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); @@ -956,6 +967,12 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, /* Symlink is encrypted */ sd = (struct f2fs_encrypted_symlink_data *)caddr; cstr.len = le16_to_cpu(sd->len); + + /* this is broken symlink case */ + if (unlikely(cstr.len == 0)) { + res = -ENOENT; + goto errout; + } cstr.name = kmalloc(cstr.len, GFP_NOFS); if (!cstr.name) { res = -ENOMEM; @@ -964,7 +981,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, memcpy(cstr.name, sd->encrypted_path, cstr.len); /* this is broken symlink case */ - if (cstr.name[0] == 0 && cstr.len == 0) { + if (unlikely(cstr.name[0] == 0)) { res = -ENOENT; goto errout; } @@ -1005,10 +1022,12 @@ const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .get_link = f2fs_encrypted_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, +#ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, .removexattr = generic_removexattr, +#endif }; #endif diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7bcbc6e9c40d..342597a5897f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -65,13 +65,14 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { - mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) + + mem_size = (atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->wb.dirty_exceeded) - return false; + if (!sbi->sb->s_bdi->wb.dirty_exceeded) + return true; } return res; } @@ -261,13 +262,11 @@ static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, { struct nat_entry *e; - down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) { e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); } - up_write(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -379,6 +378,8 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) memset(&ne, 0, sizeof(struct f2fs_nat_entry)); + down_write(&nm_i->nat_tree_lock); + /* Check current segment summary */ mutex_lock(&curseg->curseg_mutex); i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); @@ -399,6 +400,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) cache: /* cache nat entry */ cache_nat_entry(NM_I(sbi), nid, &ne); + up_write(&nm_i->nat_tree_lock); } /* @@ -676,7 +678,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, ret = truncate_dnode(&rdn); if (ret < 0) goto out_err; - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; } } else { child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; @@ -689,7 +692,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, rdn.nid = child_nid; ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); if (ret == (NIDS_PER_BLOCK + 1)) { - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; child_nofs += ret; } else if (ret < 0 && ret != -ENOENT) { goto out_err; @@ -750,7 +754,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, err = truncate_dnode(dn); if (err < 0) goto fail; - set_nid(pages[idx], i, 0, false); + if (set_nid(pages[idx], i, 0, false)) + dn->node_changed = true; } if (offset[idx + 1] == 0) { @@ -975,7 +980,8 @@ struct page *new_node_page(struct dnode_of_data *dn, fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); SetPageUptodate(page); - set_page_dirty(page); + if (set_page_dirty(page)) + dn->node_changed = true; if (f2fs_has_xattr_block(ofs)) F2FS_I(dn->inode)->i_xattr_nid = dn->nid; @@ -1035,6 +1041,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) struct page *apage; int err; + if (!nid) + return; + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); + apage = find_get_page(NODE_MAPPING(sbi), nid); if (apage && PageUptodate(apage)) { f2fs_put_page(apage, 0); @@ -1050,51 +1060,38 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 1 : 0); } -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +/* + * readahead MAX_RA_NODE number of node pages. + */ +void ra_node_pages(struct page *parent, int start) { - struct page *page; - int err; -repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); - if (!page) - return ERR_PTR(-ENOMEM); + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; - err = read_node_page(page, READ_SYNC); - if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); - } else if (err != LOCKED_PAGE) { - lock_page(page); - } + blk_start_plug(&plug); - if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto repeat; + /* Then, try readahead for siblings of the desired node */ + end = start + MAX_RA_NODE; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + ra_node_page(sbi, nid); } - return page; + + blk_finish_plug(&plug); } -/* - * Return a locked page for the desired node page. - * And, readahead MAX_RA_NODE number of node pages. - */ -struct page *get_node_page_ra(struct page *parent, int start) +struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start) { - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); - struct blk_plug plug; struct page *page; - int err, i, end; - nid_t nid; + int err; - /* First, try getting the desired direct node. */ - nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); repeat: page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) @@ -1108,46 +1105,53 @@ repeat: goto page_hit; } - blk_start_plug(&plug); - - /* Then, try readahead for siblings of the desired node */ - end = start + MAX_RA_NODE; - end = min(end, NIDS_PER_BLOCK); - for (i = start + 1; i < end; i++) { - nid = get_nid(parent, i, false); - if (!nid) - continue; - ra_node_page(sbi, nid); - } - - blk_finish_plug(&plug); + if (parent) + ra_node_pages(parent, start + 1); lock_page(page); + + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } page_hit: - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } + f2fs_bug_on(sbi, nid != nid_of_node(page)); return page; } +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +{ + return __get_node_page(sbi, nid, NULL, 0); +} + +struct page *get_node_page_ra(struct page *parent, int start) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + nid_t nid = get_nid(parent, start, false); + + return __get_node_page(sbi, nid, parent, start); +} + void sync_inode_page(struct dnode_of_data *dn) { + int ret = 0; + if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { - update_inode(dn->inode, dn->node_page); + ret = update_inode(dn->inode, dn->node_page); } else if (dn->inode_page) { if (!dn->inode_page_locked) lock_page(dn->inode_page); - update_inode(dn->inode, dn->inode_page); + ret = update_inode(dn->inode, dn->inode_page); if (!dn->inode_page_locked) unlock_page(dn->inode_page); } else { - update_inode_page(dn->inode); + ret = update_inode_page(dn->inode); } + dn->node_changed = ret ? true: false; } int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, @@ -1175,6 +1179,11 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + if (unlikely(f2fs_cp_error(sbi))) { + pagevec_release(&pvec); + return -EIO; + } + /* * flushing sequence with step: * 0. indirect nodes @@ -1349,7 +1358,7 @@ static int f2fs_write_node_page(struct page *page, up_read(&sbi->node_write); unlock_page(page); - if (wbc->for_reclaim) + if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, NODE, WRITE); return 0; @@ -1440,13 +1449,10 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) if (build) { /* do not add allocated nids */ - down_read(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne && - (!get_nat_flag(ne, IS_CHECKPOINTED) || + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) allocated = true; - up_read(&nm_i->nat_tree_lock); if (allocated) return 0; } @@ -1532,6 +1538,8 @@ static void build_free_nids(struct f2fs_sb_info *sbi) ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); + down_read(&nm_i->nat_tree_lock); + while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1560,6 +1568,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) remove_free_nid(nm_i, nid); } mutex_unlock(&curseg->curseg_mutex); + up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -1582,8 +1591,6 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - struct node_info ni; - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) @@ -1594,13 +1601,6 @@ retry: i->state = NID_ALLOC; nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); - - /* check nid is allocated already */ - get_node_info(sbi, *nid, &ni); - if (ni.blk_addr != NULL_ADDR) { - alloc_nid_done(sbi, *nid); - goto retry; - } return true; } spin_unlock(&nm_i->free_nid_list_lock); @@ -1842,14 +1842,12 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) raw_ne = nat_in_journal(sum, i); - down_write(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); if (!ne) { ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } __set_nat_cache_dirty(nm_i, ne); - up_write(&nm_i->nat_tree_lock); } update_nats_in_cursum(sum, -i); mutex_unlock(&curseg->curseg_mutex); @@ -1883,7 +1881,6 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; struct page *page = NULL; - struct f2fs_nm_info *nm_i = NM_I(sbi); /* * there are two steps to flush nat entries: @@ -1920,12 +1917,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, raw_ne = &nat_blk->entries[nid - start_nid]; } raw_nat_from_node_info(raw_ne, &ne->ni); - - down_write(&NM_I(sbi)->nat_tree_lock); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - up_write(&NM_I(sbi)->nat_tree_lock); - if (nat_get_blkaddr(ne) == NULL_ADDR) add_free_nid(sbi, nid, false); } @@ -1937,9 +1930,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, set->entry_cnt); - down_write(&nm_i->nat_tree_lock); radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - up_write(&nm_i->nat_tree_lock); kmem_cache_free(nat_entry_set_slab, set); } @@ -1959,6 +1950,9 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; + + down_write(&nm_i->nat_tree_lock); + /* * if there are no enough space in journal to store dirty nat * entries, remove all entries from journal and merge them @@ -1967,7 +1961,6 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); - down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; @@ -1976,12 +1969,13 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) __adjust_nat_entry_set(setvec[idx], &sets, MAX_NAT_JENTRIES(sum)); } - up_write(&nm_i->nat_tree_lock); /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); + up_write(&nm_i->nat_tree_lock); + f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e4fffd2d98c4..d4d1f636fe1c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -183,7 +183,7 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) block_addr = (pgoff_t)(nm_i->nat_blkaddr + (seg_off << sbi->log_blocks_per_seg << 1) + - (block_off & ((1 << sbi->log_blocks_per_seg) - 1))); + (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -317,7 +317,7 @@ static inline bool IS_DNODE(struct page *node_page) return true; } -static inline void set_nid(struct page *p, int off, nid_t nid, bool i) +static inline int set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); @@ -327,7 +327,7 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); else rn->in.nid[off] = cpu_to_le32(nid); - set_page_dirty(p); + return set_page_dirty(p); } static inline nid_t get_nid(struct page *p, int off, bool i) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index cbf74f47cce8..589b20b8677b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -168,6 +168,32 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } +static bool is_same_inode(struct inode *inode, struct page *ipage) +{ + struct f2fs_inode *ri = F2FS_INODE(ipage); + struct timespec disk; + + if (!IS_INODE(ipage)) + return true; + + disk.tv_sec = le64_to_cpu(ri->i_ctime); + disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); + if (timespec_compare(&inode->i_ctime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_atime); + disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); + if (timespec_compare(&inode->i_atime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_mtime); + disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + if (timespec_compare(&inode->i_mtime, &disk) > 0) + return false; + + return true; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); @@ -197,7 +223,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (!entry) { + if (entry) { + if (!is_same_inode(entry->inode, page)) + goto next; + } else { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) @@ -459,8 +488,7 @@ out: return err; } -static int recover_data(struct f2fs_sb_info *sbi, - struct list_head *head, int type) +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; @@ -469,7 +497,7 @@ static int recover_data(struct f2fs_sb_info *sbi, block_t blkaddr; /* get node pages in the current segment */ - curseg = CURSEG_I(sbi, type); + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); while (1) { @@ -556,7 +584,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); + err = recover_data(sbi, &inode_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); out: @@ -595,7 +623,7 @@ out: .reason = CP_RECOVERY, }; mutex_unlock(&sbi->cp_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); } else { mutex_unlock(&sbi->cp_mutex); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f77b3258454a..5904a411c86f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -86,6 +86,7 @@ static inline unsigned long __reverse_ffs(unsigned long word) /* * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. + * @size must be integral times of unsigned long. * Example: * MSB <--> LSB * f2fs_set_bit(0, bitmap) => 1000 0000 @@ -95,94 +96,73 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - - tmp = __reverse_ulong((unsigned char *)p); - tmp &= ~0UL >> offset; - - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG-1)) { + + while (1) { + if (*p == 0) + goto pass; + tmp = __reverse_ulong((unsigned char *)p); + + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) + tmp &= (~0UL << (BITS_PER_LONG - size)); if (tmp) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp &= (~0UL << (BITS_PER_LONG - size)); - if (!tmp) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __reverse_ffs(tmp); + return result; +found: + return result - size + __reverse_ffs(tmp); } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - - tmp = __reverse_ulong((unsigned char *)p); - tmp |= ~((~0UL << offset) >> offset); - - if (size < BITS_PER_LONG) - goto found_first; - if (tmp != ~0UL) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG - 1)) { + + while (1) { + if (*p == ~0UL) + goto pass; + tmp = __reverse_ulong((unsigned char *)p); + + if (offset) + tmp |= ~0UL << (BITS_PER_LONG - offset); + if (size < BITS_PER_LONG) + tmp |= ~0UL >> size; if (tmp != ~0UL) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp |= ~(~0UL << (BITS_PER_LONG - size)); - if (tmp == ~0UL) /* Are any bits zero? */ - return result + size; /* Nope. */ -found_middle: - return result + __reverse_ffz(tmp); + return result; +found: + return result - size + __reverse_ffz(tmp); } void register_inmem_page(struct inode *inode, struct page *page) @@ -233,7 +213,7 @@ int commit_inmem_pages(struct inode *inode, bool abort) * inode becomes free by iget_locked in f2fs_iget. */ if (!abort) { - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); } @@ -257,6 +237,7 @@ int commit_inmem_pages(struct inode *inode, bool abort) submit_bio = true; } } else { + ClearPageUptodate(cur->page); trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); } set_page_private(cur->page, 0); @@ -281,8 +262,10 @@ int commit_inmem_pages(struct inode *inode, bool abort) * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ -void f2fs_balance_fs(struct f2fs_sb_info *sbi) +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { + if (!need) + return; /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. @@ -310,8 +293,12 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, NAT_ENTRIES) || excess_prefree_segs(sbi) || !available_free_memory(sbi, INO_ENTRIES) || - jiffies > sbi->cp_expires) + (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + if (test_opt(sbi, DATA_FLUSH)) + sync_dirty_inodes(sbi, FILE_INODE); f2fs_sync_fs(sbi->sb, true); + stat_inc_bg_cp_count(sbi->stat_info); + } } static int issue_flush_thread(void *data) @@ -1134,6 +1121,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; struct cp_control cpc; + int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -1164,12 +1152,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) sbi->segs_per_sec) - 1, end_segno); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); } out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); - return 0; + return err; } static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) @@ -1749,13 +1737,13 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, if (le32_to_cpu(nid_in_journal(sum, i)) == val) return i; } - if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) + if (alloc && __has_cursum_space(sum, 1, NAT_JOURNAL)) return update_nats_in_cursum(sum, 1); } else if (type == SIT_JOURNAL) { for (i = 0; i < sits_in_cursum(sum); i++) if (le32_to_cpu(segno_in_journal(sum, i)) == val) return i; - if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES) + if (alloc && __has_cursum_space(sum, 1, SIT_JOURNAL)) return update_sits_in_cursum(sum, 1); } return -1; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index da0d8e0b55a5..93606f281bf9 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -32,7 +32,8 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) { - return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node); + return atomic_read(&sbi->total_zombie_tree) + + atomic_read(&sbi->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3a65e0132352..6134832baaaf 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -67,6 +67,7 @@ enum { Opt_extent_cache, Opt_noextent_cache, Opt_noinline_data, + Opt_data_flush, Opt_err, }; @@ -91,6 +92,7 @@ static match_table_t f2fs_tokens = { {Opt_extent_cache, "extent_cache"}, {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, + {Opt_data_flush, "data_flush"}, {Opt_err, NULL}, }; @@ -216,7 +218,8 @@ F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -235,6 +238,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), NULL, }; @@ -406,6 +410,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); break; + case Opt_data_flush: + set_opt(sbi, DATA_FLUSH); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -432,6 +439,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); + INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); @@ -548,7 +556,7 @@ static void f2fs_put_super(struct super_block *sb) * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. */ - release_dirty_inode(sbi); + release_ino_entry(sbi); release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); @@ -566,13 +574,14 @@ static void f2fs_put_super(struct super_block *sb) wait_for_completion(&sbi->s_kobj_unregister); sb->s_fs_info = NULL; - brelse(sbi->raw_super_buf); + kfree(sbi->raw_super); kfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err = 0; trace_f2fs_sync_fs(sb, sync); @@ -582,14 +591,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); - } else { - f2fs_balance_fs(sbi); } f2fs_trace_ios(NULL, 1); - return 0; + return err; } static int f2fs_freeze(struct super_block *sb) @@ -686,6 +693,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, DATA_FLUSH)) + seq_puts(seq, ",data_flush"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -898,7 +907,7 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_size(unsigned bits) +static loff_t max_file_blocks(void) { loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); loff_t leaf_count = ADDRS_PER_BLOCK; @@ -914,10 +923,82 @@ static loff_t max_file_size(unsigned bits) leaf_count *= NIDS_PER_BLOCK; result += leaf_count; - result <<= bits; return result; } +static inline bool sanity_check_area_boundary(struct super_block *sb, + struct f2fs_super_block *raw_super) +{ + u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); + u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); + u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr); + u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + u32 main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); + u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt); + u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit); + u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat); + u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa); + u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); + u32 segment_count = le32_to_cpu(raw_super->segment_count); + u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + + if (segment0_blkaddr != cp_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Mismatch start address, segment0(%u) cp_blkaddr(%u)", + segment0_blkaddr, cp_blkaddr); + return true; + } + + if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) != + sit_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong CP boundary, start(%u) end(%u) blocks(%u)", + cp_blkaddr, sit_blkaddr, + segment_count_ckpt << log_blocks_per_seg); + return true; + } + + if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) != + nat_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SIT boundary, start(%u) end(%u) blocks(%u)", + sit_blkaddr, nat_blkaddr, + segment_count_sit << log_blocks_per_seg); + return true; + } + + if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) != + ssa_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong NAT boundary, start(%u) end(%u) blocks(%u)", + nat_blkaddr, ssa_blkaddr, + segment_count_nat << log_blocks_per_seg); + return true; + } + + if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) != + main_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SSA boundary, start(%u) end(%u) blocks(%u)", + ssa_blkaddr, main_blkaddr, + segment_count_ssa << log_blocks_per_seg); + return true; + } + + if (main_blkaddr + (segment_count_main << log_blocks_per_seg) != + segment0_blkaddr + (segment_count << log_blocks_per_seg)) { + f2fs_msg(sb, KERN_INFO, + "Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)", + main_blkaddr, + segment0_blkaddr + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + return true; + } + + return false; +} + static int sanity_check_raw_super(struct super_block *sb, struct f2fs_super_block *raw_super) { @@ -947,6 +1028,14 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } + /* check log blocks per segment */ + if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { + f2fs_msg(sb, KERN_INFO, + "Invalid log blocks per segment (%u)\n", + le32_to_cpu(raw_super->log_blocks_per_seg)); + return 1; + } + /* Currently, support 512/1024/2048/4096 bytes sector size */ if (le32_to_cpu(raw_super->log_sectorsize) > F2FS_MAX_LOG_SECTOR_SIZE || @@ -965,6 +1054,23 @@ static int sanity_check_raw_super(struct super_block *sb, le32_to_cpu(raw_super->log_sectorsize)); return 1; } + + /* check reserved ino info */ + if (le32_to_cpu(raw_super->node_ino) != 1 || + le32_to_cpu(raw_super->meta_ino) != 2 || + le32_to_cpu(raw_super->root_ino) != 3) { + f2fs_msg(sb, KERN_INFO, + "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)", + le32_to_cpu(raw_super->node_ino), + le32_to_cpu(raw_super->meta_ino), + le32_to_cpu(raw_super->root_ino)); + return 1; + } + + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ + if (sanity_check_area_boundary(sb, raw_super)) + return 1; + return 0; } @@ -1018,7 +1124,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->nr_pages[i], 0); sbi->dir_level = DEF_DIR_LEVEL; - sbi->cp_interval = DEF_CP_INTERVAL; + sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; + sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); INIT_LIST_HEAD(&sbi->s_list); @@ -1032,111 +1139,114 @@ static void init_sb_info(struct f2fs_sb_info *sbi) */ static int read_raw_super_block(struct super_block *sb, struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf, - int *recovery) + int *valid_super_block, int *recovery) { int block = 0; - struct buffer_head *buffer; - struct f2fs_super_block *super; + struct buffer_head *bh; + struct f2fs_super_block *super, *buf; int err = 0; + super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); + if (!super) + return -ENOMEM; retry: - buffer = sb_bread(sb, block); - if (!buffer) { + bh = sb_bread(sb, block); + if (!bh) { *recovery = 1; f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", block + 1); - if (block == 0) { - block++; - goto retry; - } else { - err = -EIO; - goto out; - } + err = -EIO; + goto next; } - super = (struct f2fs_super_block *) - ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET); + buf = (struct f2fs_super_block *)(bh->b_data + F2FS_SUPER_OFFSET); /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, super)) { - brelse(buffer); + if (sanity_check_raw_super(sb, buf)) { + brelse(bh); *recovery = 1; f2fs_msg(sb, KERN_ERR, "Can't find valid F2FS filesystem in %dth superblock", block + 1); - if (block == 0) { - block++; - goto retry; - } else { - err = -EINVAL; - goto out; - } + err = -EINVAL; + goto next; } if (!*raw_super) { - *raw_super_buf = buffer; + memcpy(super, buf, sizeof(*super)); + *valid_super_block = block; *raw_super = super; - } else { - /* already have a valid superblock */ - brelse(buffer); } + brelse(bh); +next: /* check the validity of the second superblock */ if (block == 0) { block++; goto retry; } -out: /* No valid superblock */ - if (!*raw_super) + if (!*raw_super) { + kfree(super); return err; + } return 0; } +static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block) +{ + struct f2fs_super_block *super = F2FS_RAW_SUPER(sbi); + struct buffer_head *bh; + int err; + + bh = sb_getblk(sbi->sb, block); + if (!bh) + return -EIO; + + lock_buffer(bh); + memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + unlock_buffer(bh); + + /* it's rare case, we can do fua all the time */ + err = __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); + brelse(bh); + + return err; +} + int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) { - struct buffer_head *sbh = sbi->raw_super_buf; - sector_t block = sbh->b_blocknr; int err; /* write back-up superblock first */ - sbh->b_blocknr = block ? 0 : 1; - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); - - sbh->b_blocknr = block; + err = __f2fs_commit_super(sbi, sbi->valid_super_block ? 0 : 1); /* if we are in recovery path, skip writing valid superblock */ if (recover || err) - goto out; + return err; /* write current valid superblock */ - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); -out: - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - return err; + return __f2fs_commit_super(sbi, sbi->valid_super_block); } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; - struct buffer_head *raw_super_buf; struct inode *root; long err; bool retry = true, need_fsck = false; char *options = NULL; - int recovery, i; + int recovery, i, valid_super_block; try_onemore: err = -EINVAL; raw_super = NULL; - raw_super_buf = NULL; + valid_super_block = -1; recovery = 0; /* allocate memory for f2fs-specific super block info */ @@ -1150,7 +1260,8 @@ try_onemore: goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery); + err = read_raw_super_block(sb, &raw_super, &valid_super_block, + &recovery); if (err) goto free_sbi; @@ -1167,7 +1278,9 @@ try_onemore: if (err) goto free_options; - sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); + sbi->max_file_blocks = max_file_blocks(); + sb->s_maxbytes = sbi->max_file_blocks << + le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); @@ -1183,7 +1296,7 @@ try_onemore: /* init f2fs-specific super block info */ sbi->sb = sb; sbi->raw_super = raw_super; - sbi->raw_super_buf = raw_super_buf; + sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); @@ -1236,8 +1349,10 @@ try_onemore: le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; sbi->alloc_valid_block_count = 0; - INIT_LIST_HEAD(&sbi->dir_inode_list); - spin_lock_init(&sbi->dir_inode_lock); + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } init_extent_cache_info(sbi); @@ -1355,12 +1470,14 @@ try_onemore: f2fs_commit_super(sbi, true); } - sbi->cp_expires = round_jiffies_up(jiffies); - + f2fs_update_time(sbi, CP_TIME); + f2fs_update_time(sbi, REQ_TIME); return 0; free_kobj: kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); @@ -1387,7 +1504,7 @@ free_meta_inode: free_options: kfree(options); free_sb_buf: - brelse(raw_super_buf); + kfree(raw_super); free_sbi: kfree(sbi); @@ -1424,8 +1541,9 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { - f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info)); + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); if (!f2fs_inode_cachep) return -ENOMEM; return 0; @@ -1478,10 +1596,14 @@ static int __init init_f2fs_fs(void) err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; - f2fs_create_root_stats(); + err = f2fs_create_root_stats(); + if (err) + goto free_filesystem; f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_filesystem: + unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_crypto: diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 036952a945fa..10f1e784fa23 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -571,7 +571,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (ipage) return __f2fs_setxattr(inode, index, name, value, size, ipage, flags); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); /* protect xattr_ver */ @@ -580,5 +580,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); + f2fs_update_time(sbi, REQ_TIME); return err; } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 509411dd3698..6aece96df19f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -677,7 +677,7 @@ static int __init fat_init_inodecache(void) fat_inode_cachep = kmem_cache_create("fat_inode_cache", sizeof(struct msdos_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (fat_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/file.c b/fs/file.c index 1aed0add16a2..1fbc5c0555a9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -37,11 +37,12 @@ static void *alloc_fdmem(size_t size) * vmalloc() if the allocation size will be considered "large" by the VM. */ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY); + void *data = kmalloc(size, GFP_KERNEL_ACCOUNT | + __GFP_NOWARN | __GFP_NORETRY); if (data != NULL) return data; } - return vmalloc(size); + return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL); } static void __free_fdtable(struct fdtable *fdt) @@ -126,7 +127,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) if (unlikely(nr > sysctl_nr_open)) nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; - fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 023f6a1f23cd..6915c950e6e8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -677,9 +677,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, if (!wbc->wb) return; - rcu_read_lock(); id = mem_cgroup_css_from_page(page)->id; - rcu_read_unlock(); if (id == wbc->wb_id) { wbc->wb_bytes += bytes; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2913db2a5b99..4d69d5c0bedc 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1255,8 +1255,8 @@ static int __init fuse_fs_init(void) int err; fuse_inode_cachep = kmem_cache_create("fuse_inode", - sizeof(struct fuse_inode), - 0, SLAB_HWCACHE_ALIGN, + sizeof(struct fuse_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index f348cfb6b69a..437fd73e381e 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -13,6 +13,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/bio.h> #include <linux/posix_acl.h> +#include <linux/security.h> #include "gfs2.h" #include "incore.h" @@ -262,6 +263,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) if (ip) { set_bit(GIF_INVALID, &ip->i_flags); forget_all_cached_acls(&ip->i_inode); + security_inode_invalidate_secctx(&ip->i_inode); gfs2_dir_hash_inval(ip); } } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 1d709d496364..f99f8e94de3f 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -114,7 +114,8 @@ static int __init init_gfs2_fs(void) gfs2_inode_cachep = kmem_cache_create("gfs2_inode", sizeof(struct gfs2_inode), 0, SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD| + SLAB_ACCOUNT, gfs2_init_inode_once); if (!gfs2_inode_cachep) goto fail; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 4574fdd3d421..1ca95c232bb5 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -483,8 +483,8 @@ static int __init init_hfs_fs(void) int err; hfs_inode_cachep = kmem_cache_create("hfs_inode_cache", - sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN, - hfs_init_once); + sizeof(struct hfs_inode_info), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once); if (!hfs_inode_cachep) return -ENOMEM; err = register_filesystem(&hfs_fs_type); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 7302d96ae8bf..5d54490a136d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -663,7 +663,7 @@ static int __init init_hfsplus_fs(void) int err; hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache", - HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN, + HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfsplus_init_once); if (!hfsplus_inode_cachep) return -ENOMEM; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index f49be23e78aa..cfaa18c7a337 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -223,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) { struct hostfs_inode_info *hi; - hi = kmalloc(sizeof(*hi), GFP_KERNEL); + hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT); if (hi == NULL) return NULL; hi->fd = -1; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a561591896bd..458cf463047b 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -261,7 +261,7 @@ static int init_inodecache(void) hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", sizeof(struct hpfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (hpfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d8f51ee8126b..8bbf7f3e2a27 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -4,11 +4,11 @@ * Nadia Yvette Chambers, 2002 * * Copyright (C) 2002 Linus Torvalds. + * License: GPL */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> #include <linux/thread_info.h> #include <asm/current.h> #include <linux/sched.h> /* remove ASAP */ @@ -324,11 +324,48 @@ static void remove_huge_page(struct page *page) delete_from_page_cache(page); } +static void +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) +{ + struct vm_area_struct *vma; + + /* + * end == 0 indicates that the entire range after + * start should be unmapped. + */ + vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { + unsigned long v_offset; + unsigned long v_end; + + /* + * Can the expression below overflow on 32-bit arches? + * No, because the interval tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. + */ + if (vma->vm_pgoff < start) + v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; + else + v_offset = 0; + + if (!end) + v_end = vma->vm_end; + else { + v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + + vma->vm_start; + if (v_end > vma->vm_end) + v_end = vma->vm_end; + } + + unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, + NULL); + } +} /* * remove_inode_hugepages handles two distinct cases: truncation and hole * punch. There are subtle differences in operation for each case. - + * * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv @@ -379,6 +416,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; + bool rsv_on_error; u32 hash; /* @@ -395,37 +433,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, mapping, next, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); - lock_page(page); - if (likely(!page_mapped(page))) { - bool rsv_on_error = !PagePrivate(page); - /* - * We must free the huge page and remove - * from page cache (remove_huge_page) BEFORE - * removing the region/reserve map - * (hugetlb_unreserve_pages). In rare out - * of memory conditions, removal of the - * region/reserve map could fail. Before - * free'ing the page, note PagePrivate which - * is used in case of error. - */ - remove_huge_page(page); - freed++; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages( - inode, next, - next + 1, 1))) - hugetlb_fix_reserve_counts( - inode, rsv_on_error); - } - } else { - /* - * If page is mapped, it was faulted in after - * being unmapped. It indicates a race between - * hole punch and page fault. Do nothing in - * this case. Getting here in a truncate - * operation is a bug. - */ + /* + * If page is mapped, it was faulted in after being + * unmapped in caller. Unmap (again) now after taking + * the fault mutex. The mutex will prevent faults + * until we finish removing the page. + * + * This race can only happen in the hole punch case. + * Getting here in a truncate operation is a bug. + */ + if (unlikely(page_mapped(page))) { BUG_ON(truncate_op); + + i_mmap_lock_write(mapping); + hugetlb_vmdelete_list(&mapping->i_mmap, + next * pages_per_huge_page(h), + (next + 1) * pages_per_huge_page(h)); + i_mmap_unlock_write(mapping); + } + + lock_page(page); + /* + * We must free the huge page and remove from page + * cache (remove_huge_page) BEFORE removing the + * region/reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of the + * region/reserve map could fail. Before free'ing + * the page, note PagePrivate which is used in case + * of error. + */ + rsv_on_error = !PagePrivate(page); + remove_huge_page(page); + freed++; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages(inode, + next, next + 1, 1))) + hugetlb_fix_reserve_counts(inode, + rsv_on_error); } unlock_page(page); @@ -452,41 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode) clear_inode(inode); } -static inline void -hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) -{ - struct vm_area_struct *vma; - - /* - * end == 0 indicates that the entire range after - * start should be unmapped. - */ - vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { - unsigned long v_offset; - - /* - * Can the expression below overflow on 32-bit arches? - * No, because the interval tree returns us only those vmas - * which overlap the truncated area starting at pgoff, - * and no vma on a 32-bit arch can span beyond the 4GB. - */ - if (vma->vm_pgoff < start) - v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; - else - v_offset = 0; - - if (end) { - end = ((end - start) << PAGE_SHIFT) + - vma->vm_start + v_offset; - if (end > vma->vm_end) - end = vma->vm_end; - } else - end = vma->vm_end; - - unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL); - } -} - static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; @@ -708,7 +717,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, /* * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never * be taken from reclaim -- unlike regular filesystems. This needs an - * annotation because huge_pmd_share() does an allocation under + * annotation because huge_pmd_share() does an allocation under hugetlb's * i_mmap_rwsem. */ static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; @@ -738,7 +747,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * The policy is initialized here even if we are creating a * private inode because initialization simply creates an - * an empty rb tree and calls spin_lock_init(), later when we + * an empty rb tree and calls rwlock_init(), later when we * call mpol_free_shared_policy() it will just return because * the rb tree will still be empty. */ @@ -1202,7 +1211,6 @@ static struct file_system_type hugetlbfs_fs_type = { .mount = hugetlbfs_mount, .kill_sb = kill_litter_super, }; -MODULE_ALIAS_FS("hugetlbfs"); static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -1322,7 +1330,7 @@ static int __init init_hugetlbfs_fs(void) error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), - 0, 0, init_once); + 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) goto out2; @@ -1356,26 +1364,4 @@ static int __init init_hugetlbfs_fs(void) out2: return error; } - -static void __exit exit_hugetlbfs_fs(void) -{ - struct hstate *h; - int i; - - - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(hugetlbfs_inode_cachep); - i = 0; - for_each_hstate(h) - kern_unmount(hugetlbfs_vfsmount[i++]); - unregister_filesystem(&hugetlbfs_fs_type); -} - -module_init(init_hugetlbfs_fs) -module_exit(exit_hugetlbfs_fs) - -MODULE_LICENSE("GPL"); +fs_initcall(init_hugetlbfs_fs) diff --git a/fs/inode.c b/fs/inode.c index 4230f66b7410..e491e54d2430 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1883,7 +1883,7 @@ void __init inode_init(void) sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 61abdc4920da..bcd2d41b318a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -94,7 +94,7 @@ static int __init init_inodecache(void) isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", sizeof(struct iso_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (isofs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index d86c5e3176a1..bb080c272149 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -387,7 +387,7 @@ static int __init init_jffs2_fs(void) jffs2_inode_cachep = kmem_cache_create("jffs2_i", sizeof(struct jffs2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), jffs2_i_init_once); if (!jffs2_inode_cachep) { pr_err("error: Failed to initialise inode cache\n"); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index f3a4857ff071..5a3da3f52908 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1153,7 +1153,7 @@ static struct jffs2_sb_info *work_to_sb(struct work_struct *work) { struct delayed_work *dwork; - dwork = container_of(work, struct delayed_work, work); + dwork = to_delayed_work(work); return container_of(dwork, struct jffs2_sb_info, wbuf_dwork); } diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 8f9176caf098..900925b5eb8c 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -898,7 +898,7 @@ static int __init init_jfs_fs(void) jfs_inode_cachep = kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, init_once); if (jfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 742bf4a230e8..821973853340 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -541,14 +541,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - /* - * If the ino of the sysfs entry created for a kmem cache gets - * allocated from an ida layer, which is accounted to the memcg that - * owns the cache, the memcg will get pinned forever. So do not account - * ino ida allocations. - */ - ret = ida_simple_get(&root->ino_ida, 1, 0, - GFP_KERNEL | __GFP_NOACCOUNT); + ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); if (ret < 0) goto err_out2; kn->ino = ret; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 5f31ebd96c06..154a107cd376 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -25,13 +25,17 @@ #include <linux/mutex.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/inetdevice.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> +#include <linux/sunrpc/svc_xprt.h> #include <net/ip.h> +#include <net/addrconf.h> +#include <net/ipv6.h> #include <linux/lockd/lockd.h> #include <linux/nfs.h> @@ -44,7 +48,7 @@ static struct svc_program nlmsvc_program; -struct nlmsvc_binding * nlmsvc_ops; +const struct nlmsvc_binding *nlmsvc_ops; EXPORT_SYMBOL_GPL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); @@ -90,8 +94,7 @@ static unsigned long get_lockd_grace_period(void) static void grace_ender(struct work_struct *grace) { - struct delayed_work *dwork = container_of(grace, struct delayed_work, - work); + struct delayed_work *dwork = to_delayed_work(grace); struct lockd_net *ln = container_of(dwork, struct lockd_net, grace_period_end); @@ -279,6 +282,68 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) } } +static int lockd_inetaddr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct sockaddr_in sin; + + if (event != NETDEV_DOWN) + goto out; + + if (nlmsvc_rqst) { + dprintk("lockd_inetaddr_event: removed %pI4\n", + &ifa->ifa_local); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ifa->ifa_local; + svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, + (struct sockaddr *)&sin); + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block lockd_inetaddr_notifier = { + .notifier_call = lockd_inetaddr_event, +}; + +#if IS_ENABLED(CONFIG_IPV6) +static int lockd_inet6addr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; + struct sockaddr_in6 sin6; + + if (event != NETDEV_DOWN) + goto out; + + if (nlmsvc_rqst) { + dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = ifa->addr; + svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, + (struct sockaddr *)&sin6); + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block lockd_inet6addr_notifier = { + .notifier_call = lockd_inet6addr_event, +}; +#endif + +static void lockd_svc_exit_thread(void) +{ + unregister_inetaddr_notifier(&lockd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&lockd_inet6addr_notifier); +#endif + svc_exit_thread(nlmsvc_rqst); +} + static int lockd_start_svc(struct svc_serv *serv) { int error; @@ -315,7 +380,7 @@ static int lockd_start_svc(struct svc_serv *serv) return 0; out_task: - svc_exit_thread(nlmsvc_rqst); + lockd_svc_exit_thread(); nlmsvc_task = NULL; out_rqst: nlmsvc_rqst = NULL; @@ -360,6 +425,10 @@ static struct svc_serv *lockd_create_svc(void) printk(KERN_WARNING "lockd_up: create service failed\n"); return ERR_PTR(-ENOMEM); } + register_inetaddr_notifier(&lockd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + register_inet6addr_notifier(&lockd_inet6addr_notifier); +#endif dprintk("lockd_up: service created\n"); return serv; } @@ -428,7 +497,7 @@ lockd_down(struct net *net) } kthread_stop(nlmsvc_task); dprintk("lockd_down: service stopped\n"); - svc_exit_thread(nlmsvc_rqst); + lockd_svc_exit_thread(); dprintk("lockd_down: service destroyed\n"); nlmsvc_task = NULL; nlmsvc_rqst = NULL; diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig index 09ed066c0221..2b4503163930 100644 --- a/fs/logfs/Kconfig +++ b/fs/logfs/Kconfig @@ -1,6 +1,6 @@ config LOGFS tristate "LogFS file system" - depends on (MTD || BLOCK) + depends on MTD || (!MTD && BLOCK) select ZLIB_INFLATE select ZLIB_DEFLATE select CRC32 diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 0fce46d62b9c..db9cfc598883 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -409,7 +409,8 @@ const struct super_operations logfs_super_operations = { int logfs_init_inode_cache(void) { logfs_inode_cache = kmem_cache_create("logfs_inode_cache", - sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + sizeof(struct logfs_inode), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, logfs_init_once); if (!logfs_inode_cache) return -ENOMEM; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index cb1789ca1ee6..f975d667c539 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -91,7 +91,7 @@ static int __init init_inodecache(void) minix_inode_cachep = kmem_cache_create("minix_inode_cache", sizeof(struct minix_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (minix_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index ce1eb3f9dfe8..1af15fcbe57b 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -82,7 +82,7 @@ static int init_inodecache(void) ncp_inode_cachep = kmem_cache_create("ncp_inode_cache", sizeof(struct ncp_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ncp_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 807eb6ef4f91..f0939d097406 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -83,8 +83,11 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, res = htonl(NFS4ERR_BADHANDLE); inode = nfs_delegation_find_inode(cps->clp, &args->fh); - if (inode == NULL) + if (inode == NULL) { + trace_nfs4_cb_recall(cps->clp, &args->fh, NULL, + &args->stateid, -ntohl(res)); goto out; + } /* Set up a helper thread to actually return the delegation */ switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { case 0: @@ -96,7 +99,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, default: res = htonl(NFS4ERR_RESOURCE); } - trace_nfs4_recall_delegation(inode, -ntohl(res)); + trace_nfs4_cb_recall(cps->clp, &args->fh, inode, + &args->stateid, -ntohl(res)); iput(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); @@ -160,6 +164,22 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, return lo; } +/* + * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) + */ +static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new) +{ + u32 oldseq, newseq; + + oldseq = be32_to_cpu(lo->plh_stateid.seqid); + newseq = be32_to_cpu(new->seqid); + + if (newseq > oldseq + 1) + return false; + return true; +} + static u32 initiate_file_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { @@ -169,34 +189,52 @@ static u32 initiate_file_draining(struct nfs_client *clp, LIST_HEAD(free_me_list); lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); - if (!lo) + if (!lo) { + trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, + &args->cbl_stateid, -rv); goto out; + } ino = lo->plh_inode; spin_lock(&ino->i_lock); + if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) { + rv = NFS4ERR_DELAY; + goto unlock; + } pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); spin_unlock(&ino->i_lock); pnfs_layoutcommit_inode(ino, false); spin_lock(&ino->i_lock); - if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || - pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, - &args->cbl_range)) { + /* + * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) + */ + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { rv = NFS4ERR_DELAY; goto unlock; } + if (pnfs_mark_matching_lsegs_return(lo, &free_me_list, + &args->cbl_range)) { + rv = NFS4_OK; + goto unlock; + } + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &args->cbl_range); } + pnfs_mark_layout_returned_if_empty(lo); unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); + /* Free all lsegs that are attached to commit buckets */ + nfs_commit_inode(ino, 0); pnfs_put_layout_hdr(lo); - trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv); + trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino, + &args->cbl_stateid, -rv); iput(ino); out: return rv; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ce5a21861074..c82a21228a34 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1894,15 +1894,14 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) attr.ia_mode = S_IFLNK | S_IRWXUGO; attr.ia_valid = ATTR_MODE; - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_USER); if (!page) return -ENOMEM; - kaddr = kmap_atomic(page); + kaddr = page_address(page); memcpy(kaddr, symname, pathlen); if (pathlen < PAGE_SIZE) memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); - kunmap_atomic(kaddr); trace_nfs_symlink_enter(dir, dentry); error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); @@ -2432,6 +2431,20 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) } EXPORT_SYMBOL_GPL(nfs_may_open); +static int nfs_execute_ok(struct inode *inode, int mask) +{ + struct nfs_server *server = NFS_SERVER(inode); + int ret; + + if (mask & MAY_NOT_BLOCK) + ret = nfs_revalidate_inode_rcu(server, inode); + else + ret = nfs_revalidate_inode(server, inode); + if (ret == 0 && !execute_ok(inode)) + ret = -EACCES; + return ret; +} + int nfs_permission(struct inode *inode, int mask) { struct rpc_cred *cred; @@ -2449,6 +2462,9 @@ int nfs_permission(struct inode *inode, int mask) case S_IFLNK: goto out; case S_IFREG: + if ((mask & MAY_OPEN) && + nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)) + return 0; break; case S_IFDIR: /* @@ -2481,8 +2497,8 @@ force_lookup: res = PTR_ERR(cred); } out: - if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) - res = -EACCES; + if (!res && (mask & MAY_EXEC)) + res = nfs_execute_ok(inode, mask); dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4b1d08f56aba..7ab7ec9f4eed 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -117,12 +117,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq) return atomic_dec_and_test(&dreq->io_count); } -void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq) -{ - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; -} -EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes); - static void nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) { @@ -670,6 +664,10 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) req = nfs_list_entry(reqs.next); nfs_direct_setup_mirroring(dreq, &desc, req); + if (desc.pg_error < 0) { + list_splice_init(&reqs, &failed); + goto out_failed; + } list_for_each_entry_safe(req, tmp, &reqs, wb_list) { if (!nfs_pageio_add_request(&desc, req)) { @@ -677,13 +675,17 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_list_add_request(req, &failed); spin_lock(cinfo.lock); dreq->flags = 0; - dreq->error = -EIO; + if (desc.pg_error < 0) + dreq->error = desc.pg_error; + else + dreq->error = -EIO; spin_unlock(cinfo.lock); } nfs_release_request(req); } nfs_pageio_complete(&desc); +out_failed: while (!list_empty(&failed)) { req = nfs_list_entry(failed.next); nfs_list_remove_request(req); @@ -727,14 +729,20 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) nfs_direct_write_complete(dreq, data->inode); } -static void nfs_direct_error_cleanup(struct nfs_inode *nfsi) +static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, + struct nfs_page *req) { - /* There is no lock to clear */ + struct nfs_direct_req *dreq = cinfo->dreq; + + spin_lock(&dreq->lock); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + nfs_mark_request_commit(req, NULL, cinfo, 0); } static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = { .completion = nfs_direct_commit_complete, - .error_cleanup = nfs_direct_error_cleanup, + .resched_write = nfs_direct_resched_write, }; static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) @@ -839,10 +847,25 @@ static void nfs_write_sync_pgio_error(struct list_head *head) } } +static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) +{ + struct nfs_direct_req *dreq = hdr->dreq; + + spin_lock(&dreq->lock); + if (dreq->error == 0) { + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + /* fake unstable write to let common nfs resend pages */ + hdr->verf.committed = NFS_UNSTABLE; + hdr->good_bytes = hdr->args.count; + } + spin_unlock(&dreq->lock); +} + static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { .error_cleanup = nfs_write_sync_pgio_error, .init_hdr = nfs_direct_pgio_init, .completion = nfs_direct_write_completion, + .reschedule_io = nfs_direct_write_reschedule_io, }; @@ -900,6 +923,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, } nfs_direct_setup_mirroring(dreq, &desc, req); + if (desc.pg_error < 0) { + nfs_free_request(req); + result = desc.pg_error; + break; + } nfs_lock_request(req); req->wb_index = pos >> PAGE_SHIFT; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 93e236429c5d..4ef8f5addcad 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -514,7 +514,7 @@ static void nfs_check_dirty_writeback(struct page *page, * so it will not block due to pages that will shortly be freeable. */ nfsi = NFS_I(mapping->host); - if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) { + if (atomic_read(&nfsi->commit_info.rpcs_out)) { *writeback = true; return; } @@ -545,7 +545,7 @@ static int nfs_launder_page(struct page *page) inode->i_ino, (long long)page_offset(page)); nfs_fscache_wait_on_page_write(nfsi, page); - return nfs_wb_page(inode, page); + return nfs_wb_launder_page(inode, page); } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -756,7 +756,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); if (!IS_ERR(l_ctx)) { - status = nfs_iocounter_wait(&l_ctx->io_count); + status = nfs_iocounter_wait(l_ctx); nfs_put_lock_context(l_ctx); if (status < 0) return status; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 02ec07973bc4..bb1f4e7a3270 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -202,6 +202,7 @@ static int filelayout_async_handle_error(struct rpc_task *task, task->tk_status); nfs4_mark_deviceid_unavailable(devid); pnfs_error_mark_layout_for_return(inode, lseg); + pnfs_set_lo_fail(lseg); rpc_wake_up(&tbl->slot_tbl_waitq); /* fall through */ default: @@ -883,13 +884,19 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_READ, GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) nfs_pageio_reset_read_mds(pgio); @@ -902,13 +909,20 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } + /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) goto out_mds; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 03516c80855a..6594e9f903a0 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -145,7 +145,7 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, return false; for (i = 0; i < m1->fh_versions_cnt; i++) { bool found_fh = false; - for (j = 0; j < m2->fh_versions_cnt; i++) { + for (j = 0; j < m2->fh_versions_cnt; j++) { if (nfs_compare_fh(&m1->fh_versions[i], &m2->fh_versions[j]) == 0) { found_fh = true; @@ -505,9 +505,17 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, } p = xdr_inline_decode(&stream, 4); - if (p) - fls->flags = be32_to_cpup(p); + if (!p) + goto out_sort_mirrors; + fls->flags = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_sort_mirrors; + for (i=0; i < fls->mirror_array_cnt; i++) + fls->mirror_array[i]->report_interval = be32_to_cpup(p); +out_sort_mirrors: ff_layout_sort_mirrors(fls); rc = ff_layout_check_layout(lgr); if (rc) @@ -603,7 +611,9 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, mirror->start_time = now; if (ktime_equal(mirror->last_report_time, notime)) mirror->last_report_time = now; - if (layoutstats_timer != 0) + if (mirror->report_interval != 0) + report_interval = (s64)mirror->report_interval * 1000LL; + else if (layoutstats_timer != 0) report_interval = (s64)layoutstats_timer * 1000LL; if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= report_interval) { @@ -785,13 +795,19 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, int ds_idx; /* Use full layout for now */ - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_READ, GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) goto out_mds; @@ -825,13 +841,19 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, int i; int status; - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) goto out_mds; @@ -867,18 +889,25 @@ static unsigned int ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + goto out; + } + } if (pgio->pg_lseg) return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); /* no lseg means that pnfs is not in use, so no mirroring here */ nfs_pageio_reset_write_mds(pgio); +out: return 1; } @@ -912,18 +941,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) hdr->args.count, (unsigned long long)hdr->args.offset); - if (!hdr->dreq) { - struct nfs_open_context *ctx; - - ctx = nfs_list_entry(hdr->pages.next)->wb_context; - set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); - hdr->completion_ops->error_cleanup(&hdr->pages); - } else { - nfs_direct_set_resched_writes(hdr->dreq); - /* fake unstable write to let common nfs resend pages */ - hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = hdr->args.count; - } + hdr->completion_ops->reschedule_io(hdr); return; } @@ -1101,7 +1119,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, return -NFS4ERR_RESET_TO_PNFS; out_retry: task->tk_status = 0; - rpc_restart_call(task); + rpc_restart_call_prepare(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); return -EAGAIN; } @@ -1159,6 +1177,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, } } + switch (status) { + case NFS4ERR_DELAY: + case NFS4ERR_GRACE: + return; + default: + break; + } + mirror = FF_LAYOUT_COMP(lseg, idx); err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, offset, length, status, opnum, @@ -1242,14 +1268,31 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) return ff_layout_test_devid_unavailable(node); } -static int ff_layout_read_prepare_common(struct rpc_task *task, - struct nfs_pgio_header *hdr) +static void ff_layout_read_record_layoutstats_start(struct rpc_task *task, + struct nfs_pgio_header *hdr) { + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; nfs4_ff_layout_stat_io_start_read(hdr->inode, FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), hdr->args.count, task->tk_start); +} + +static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; + nfs4_ff_layout_stat_io_end_read(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, + hdr->res.count); +} +static int ff_layout_read_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -1265,6 +1308,7 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, } hdr->pgio_done_cb = ff_layout_read_done_cb; + ff_layout_read_record_layoutstats_start(task, hdr); return 0; } @@ -1323,10 +1367,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data) dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); - nfs4_ff_layout_stat_io_end_read(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count); - if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1341,10 +1381,20 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; + ff_layout_read_record_layoutstats_done(task, hdr); rpc_count_iostats_metrics(task, &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]); } +static void ff_layout_read_release(void *data) +{ + struct nfs_pgio_header *hdr = data; + + ff_layout_read_record_layoutstats_done(&hdr->task, hdr); + pnfs_generic_rw_release(data); +} + + static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { @@ -1362,15 +1412,12 @@ static int ff_layout_write_done_cb(struct rpc_task *task, switch (err) { case -NFS4ERR_RESET_TO_PNFS: - pnfs_set_retry_layoutget(hdr->lseg->pls_layout); ff_layout_reset_write(hdr, true); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); ff_layout_reset_write(hdr, false); return task->tk_status; case -EAGAIN: - rpc_restart_call_prepare(task); return -EAGAIN; } @@ -1402,11 +1449,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, switch (err) { case -NFS4ERR_RESET_TO_PNFS: - pnfs_set_retry_layoutget(data->lseg->pls_layout); pnfs_generic_prepare_to_resend_writes(data); return -EAGAIN; case -NFS4ERR_RESET_TO_MDS: - pnfs_clear_retry_layoutget(data->lseg->pls_layout); pnfs_generic_prepare_to_resend_writes(data); return -EAGAIN; case -EAGAIN: @@ -1421,14 +1466,31 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return 0; } -static int ff_layout_write_prepare_common(struct rpc_task *task, - struct nfs_pgio_header *hdr) +static void ff_layout_write_record_layoutstats_start(struct rpc_task *task, + struct nfs_pgio_header *hdr) { + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; nfs4_ff_layout_stat_io_start_write(hdr->inode, FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), hdr->args.count, task->tk_start); +} + +static void ff_layout_write_record_layoutstats_done(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, hdr->res.count, + hdr->res.verf->committed); +} +static int ff_layout_write_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -1445,6 +1507,7 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EAGAIN; } + ff_layout_write_record_layoutstats_start(task, hdr); return 0; } @@ -1480,11 +1543,6 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count, - hdr->res.verf->committed); - if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1499,18 +1557,53 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; + ff_layout_write_record_layoutstats_done(task, hdr); rpc_count_iostats_metrics(task, &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]); } -static void ff_layout_commit_prepare_common(struct rpc_task *task, +static void ff_layout_write_release(void *data) +{ + struct nfs_pgio_header *hdr = data; + + ff_layout_write_record_layoutstats_done(&hdr->task, hdr); + pnfs_generic_rw_release(data); +} + +static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task, struct nfs_commit_data *cdata) { + if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags)) + return; nfs4_ff_layout_stat_io_start_write(cdata->inode, FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), 0, task->tk_start); } +static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, + struct nfs_commit_data *cdata) +{ + struct nfs_page *req; + __u64 count = 0; + + if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags)) + return; + + if (task->tk_status == 0) { + list_for_each_entry(req, &cdata->pages, wb_list) + count += req->wb_bytes; + } + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + count, count, NFS_FILE_SYNC); +} + +static void ff_layout_commit_prepare_common(struct rpc_task *task, + struct nfs_commit_data *cdata) +{ + ff_layout_commit_record_layoutstats_start(task, cdata); +} + static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) { ff_layout_commit_prepare_common(task, data); @@ -1531,19 +1624,6 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) static void ff_layout_commit_done(struct rpc_task *task, void *data) { - struct nfs_commit_data *cdata = data; - struct nfs_page *req; - __u64 count = 0; - - if (task->tk_status == 0) { - list_for_each_entry(req, &cdata->pages, wb_list) - count += req->wb_bytes; - } - - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), - count, count, NFS_FILE_SYNC); - pnfs_generic_write_commit_done(task, data); } @@ -1551,50 +1631,59 @@ static void ff_layout_commit_count_stats(struct rpc_task *task, void *data) { struct nfs_commit_data *cdata = data; + ff_layout_commit_record_layoutstats_done(task, cdata); rpc_count_iostats_metrics(task, &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]); } +static void ff_layout_commit_release(void *data) +{ + struct nfs_commit_data *cdata = data; + + ff_layout_commit_record_layoutstats_done(&cdata->task, cdata); + pnfs_generic_commit_release(data); +} + static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { .rpc_call_prepare = ff_layout_read_prepare_v3, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_read_release, }; static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { .rpc_call_prepare = ff_layout_read_prepare_v4, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_read_release, }; static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { .rpc_call_prepare = ff_layout_write_prepare_v3, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_write_release, }; static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { .rpc_call_prepare = ff_layout_write_prepare_v4, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_write_release, }; static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = { .rpc_call_prepare = ff_layout_commit_prepare_v3, .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, - .rpc_release = pnfs_generic_commit_release, + .rpc_release = ff_layout_commit_release, }; static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = { .rpc_call_prepare = ff_layout_commit_prepare_v4, .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, - .rpc_release = pnfs_generic_commit_release, + .rpc_release = ff_layout_commit_release, }; static enum pnfs_try_status diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 2bb08bc6aaf0..dd353bb7dc0a 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -85,6 +85,7 @@ struct nfs4_ff_layout_mirror { struct nfs4_ff_layoutstat write_stat; ktime_t start_time; ktime_t last_report_time; + u32 report_interval; }; struct nfs4_ff_layout_segment { diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e125e55de86d..bd0327541366 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -429,22 +429,14 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); - if (fail_return) { - pnfs_error_mark_layout_for_return(ino, lseg); - if (ff_layout_has_available_ds(lseg)) - pnfs_set_retry_layoutget(lseg->pls_layout); - else - pnfs_clear_retry_layoutget(lseg->pls_layout); - - } else { + if (!fail_return) { if (ff_layout_has_available_ds(lseg)) set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lseg->pls_layout->plh_flags); - else { + else pnfs_error_mark_layout_for_return(ino, lseg); - pnfs_clear_retry_layoutget(lseg->pls_layout); - } - } + } else + pnfs_error_mark_layout_for_return(ino, lseg); } out_update_creds: if (ff_layout_update_mirror_cred(mirror, ds)) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bdb4dc7b4ecd..8e24d886d2c5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -71,19 +71,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } -/** - * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks - * @word: long word containing the bit lock - */ -int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) +static int nfs_wait_killable(int mode) { freezable_schedule_unsafe(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; } + +int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) +{ + return nfs_wait_killable(mode); +} EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); +int nfs_wait_atomic_killable(atomic_t *p) +{ + return nfs_wait_killable(TASK_KILLABLE); +} + /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid @@ -700,7 +706,7 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) l_ctx->lockowner.l_owner = current->files; l_ctx->lockowner.l_pid = current->tgid; INIT_LIST_HEAD(&l_ctx->list); - nfs_iocounter_init(&l_ctx->io_count); + atomic_set(&l_ctx->io_count, 0); } static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) @@ -913,6 +919,12 @@ void nfs_file_clear_open_context(struct file *filp) if (ctx) { struct inode *inode = d_inode(ctx->dentry); + /* + * We fatal error on write before. Try to writeback + * every page again. + */ + if (ctx->error < 0) + invalidate_inode_pages2(inode->i_mapping); filp->private_data = NULL; spin_lock(&inode->i_lock); list_move_tail(&ctx->list, &NFS_I(inode)->open_files); @@ -1663,6 +1675,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; + bool cache_revalidated = true; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, @@ -1724,22 +1737,28 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); inode->i_version = fattr->change_attr; } - } else + } else { nfsi->cache_validity |= save_cache_validity; + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_MTIME) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - } else if (server->caps & NFS_CAP_MTIME) + } else if (server->caps & NFS_CAP_MTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_CTIME) { memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - } else if (server->caps & NFS_CAP_CTIME) + } else if (server->caps & NFS_CAP_CTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -1759,19 +1778,23 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) (long long)cur_isize, (long long)new_isize); } - } else + } else { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_ATIME) memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - else if (server->caps & NFS_CAP_ATIME) + else if (server->caps & NFS_CAP_ATIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATIME | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { @@ -1780,36 +1803,42 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_mode = newmode; invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; } - } else if (server->caps & NFS_CAP_MODE) + } else if (server->caps & NFS_CAP_MODE) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; } - } else if (server->caps & NFS_CAP_OWNER) + } else if (server->caps & NFS_CAP_OWNER) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; } - } else if (server->caps & NFS_CAP_OWNER_GROUP) + } else if (server->caps & NFS_CAP_OWNER_GROUP) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { @@ -1818,19 +1847,22 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_DATA; set_nlink(inode, fattr->nlink); } - } else if (server->caps & NFS_CAP_NLINK) + } else if (server->caps & NFS_CAP_NLINK) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } - if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; + else + cache_revalidated = false; /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { @@ -1840,9 +1872,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Set barrier to be more recent than all outstanding updates */ nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { - if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { - if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) - nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); + if (cache_revalidated) { + if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, + nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { + nfsi->attrtimeo <<= 1; + if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode)) + nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); + } nfsi->attrtimeo_timestamp = now; } /* Set the barrier to be more recent than this fattr */ @@ -1851,7 +1887,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } /* Don't declare attrcache up to date if there were no attrs! */ - if (fattr->valid != 0) + if (cache_revalidated) invalid &= ~NFS_INO_INVALID_ATTR; /* Don't invalidate the data if we were to blame */ @@ -1933,7 +1969,7 @@ static int __init nfs_init_inodecache(void) nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (nfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9dea85f7f918..4e8cc942336c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -238,7 +238,7 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); -int nfs_iocounter_wait(struct nfs_io_counter *c); +int nfs_iocounter_wait(struct nfs_lock_context *l_ctx); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); @@ -252,18 +252,18 @@ void nfs_free_request(struct nfs_page *req); struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); -static inline void nfs_iocounter_init(struct nfs_io_counter *c) -{ - c->flags = 0; - atomic_set(&c->io_count, 0); -} - static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc) { WARN_ON_ONCE(desc->pg_mirror_count < 1); return desc->pg_mirror_count > 1; } +static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1, + const struct nfs_open_context *ctx2) +{ + return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; +} + /* nfs2xdr.c */ extern struct rpc_procinfo nfs_procedures[]; extern int nfs2_decode_dirent(struct xdr_stream *, @@ -380,6 +380,7 @@ extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); +extern int nfs_wait_atomic_killable(atomic_t *p); /* super.c */ extern const struct super_operations nfs_sops; @@ -519,7 +520,6 @@ static inline void nfs_inode_dio_wait(struct inode *inode) inode_dio_wait(inode); } extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); -extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_pgio_header *); @@ -696,9 +696,32 @@ static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) { return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); } +static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) +{ + return ~crc32_le(0xFFFFFFFF, &stateid->other[0], + NFS4_STATEID_OTHER_SIZE); +} #else static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) { return 0; } +static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) +{ + return 0; +} #endif + +static inline bool nfs_error_is_fatal(int err) +{ + switch (err) { + case -ERESTARTSYS: + case -EIO: + case -ENOSPC: + case -EROFS: + case -E2BIG: + return true; + default: + return false; + } +} diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6b1ce9825430..6e8174930a48 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -204,6 +204,8 @@ static void nfs42_layoutstat_done(struct rpc_task *task, void *calldata) { struct nfs42_layoutstat_data *data = calldata; + struct inode *inode = data->inode; + struct pnfs_layout_hdr *lo; if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -211,12 +213,35 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_BAD_STATEID: + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo && nfs4_stateid_match(&data->args.stateid, + &lo->plh_stateid)) { + LIST_HEAD(head); + + /* + * Mark the bad layout state as invalid, then retry + * with the current stateid. + */ + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + } else + spin_unlock(&inode->i_lock); + break; case -ENOTSUPP: case -EOPNOTSUPP: - NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS; + NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; default: - dprintk("%s server returns %d\n", __func__, task->tk_status); + break; } + + dprintk("%s server returns %d\n", __func__, task->tk_status); } static void diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c57d1332c1c8..4bfc33ad0563 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -208,6 +208,9 @@ static const u32 nfs4_pnfs_open_bitmap[3] = { | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY, FATTR4_WORD2_MDSTHRESHOLD +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + | FATTR4_WORD2_SECURITY_LABEL +#endif }; static const u32 nfs4_open_noattr_bitmap[3] = { @@ -1385,6 +1388,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s * Protect the call to nfs4_state_set_mode_locked and * serialise the stateid update */ + spin_lock(&state->owner->so_lock); write_seqlock(&state->seqlock); if (deleg_stateid != NULL) { nfs4_stateid_copy(&state->stateid, deleg_stateid); @@ -1393,7 +1397,6 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s if (open_stateid != NULL) nfs_set_open_stateid_locked(state, open_stateid, fmode); write_sequnlock(&state->seqlock); - spin_lock(&state->owner->so_lock); update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); } @@ -1598,6 +1601,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) if (!data->rpc_done) { state = nfs4_try_open_cached(data); + trace_nfs4_cached_open(data->state); goto out; } @@ -2015,6 +2019,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) } return; unlock_no_action: + trace_nfs4_cached_open(data->state); rcu_read_unlock(); out_no_action: task->tk_action = NULL; @@ -2703,6 +2708,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (status == 0 && state != NULL) renew_lease(server, timestamp); + trace_nfs4_setattr(inode, &arg.stateid, status); return status; } @@ -2719,7 +2725,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, int err; do { err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); - trace_nfs4_setattr(inode, err); switch (err) { case -NFS4ERR_OPENMODE: if (!(sattr->ia_valid & ATTR_SIZE)) { @@ -5048,7 +5053,6 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, static int nfs4_init_nonuniform_client_string(struct nfs_client *clp) { - int result; size_t len; char *str; @@ -5076,7 +5080,7 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) return -ENOMEM; rcu_read_lock(); - result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", + scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", clp->cl_ipaddr, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); @@ -5089,7 +5093,6 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) static int nfs4_init_uniquifier_client_string(struct nfs_client *clp) { - int result; size_t len; char *str; @@ -5109,7 +5112,7 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp) if (!str) return -ENOMEM; - result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s", + scnprintf(str, len, "Linux NFSv%u.%u %s/%s", clp->rpc_ops->version, clp->cl_minorversion, nfs4_client_id_uniquifier, clp->cl_rpcclient->cl_nodename); @@ -5120,7 +5123,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp) static int nfs4_init_uniform_client_string(struct nfs_client *clp) { - int result; size_t len; char *str; @@ -5145,7 +5147,7 @@ nfs4_init_uniform_client_string(struct nfs_client *clp) if (!str) return -ENOMEM; - result = scnprintf(str, len, "Linux NFSv%u.%u %s", + scnprintf(str, len, "Linux NFSv%u.%u %s", clp->rpc_ops->version, clp->cl_minorversion, clp->cl_rpcclient->cl_nodename); clp->cl_owner_id = str; @@ -5384,6 +5386,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co if (data == NULL) return -ENOMEM; nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + + nfs4_state_protect(server->nfs_client, + NFS_SP4_MACH_CRED_CLEANUP, + &task_setup_data.rpc_client, &msg); + data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; data->args.bitmask = server->cache_consistency_bitmask; @@ -5426,7 +5433,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 int err; do { err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); - trace_nfs4_delegreturn(inode, err); + trace_nfs4_delegreturn(inode, stateid, err); switch (err) { case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -5936,6 +5943,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f data->cancelled = 1; rpc_put_task(task); dprintk("%s: done, ret = %d!\n", __func__, ret); + trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret); return ret; } @@ -5952,7 +5960,6 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); - trace_nfs4_lock_reclaim(request, state, F_SETLK, err); if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -5979,7 +5986,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED); - trace_nfs4_lock_expired(request, state, F_SETLK, err); switch (err) { default: goto out; @@ -6087,7 +6093,6 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock * do { err = _nfs4_proc_setlk(state, cmd, request); - trace_nfs4_set_lock(request, state, cmd, err); if (err == -NFS4ERR_DENIED) err = -EAGAIN; err = nfs4_handle_exception(NFS_SERVER(state->inode), @@ -6847,10 +6852,13 @@ static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = { }, .allow.u.words = { [0] = 1 << (OP_CLOSE) | + 1 << (OP_OPEN_DOWNGRADE) | 1 << (OP_LOCKU) | + 1 << (OP_DELEGRETURN) | 1 << (OP_COMMIT), [1] = 1 << (OP_SECINFO - 32) | 1 << (OP_SECINFO_NO_NAME - 32) | + 1 << (OP_LAYOUTRETURN - 32) | 1 << (OP_TEST_STATEID - 32) | 1 << (OP_FREE_STATEID - 32) | 1 << (OP_WRITE - 32) @@ -6915,11 +6923,19 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, } if (test_bit(OP_CLOSE, sp->allow.u.longs) && + test_bit(OP_OPEN_DOWNGRADE, sp->allow.u.longs) && + test_bit(OP_DELEGRETURN, sp->allow.u.longs) && test_bit(OP_LOCKU, sp->allow.u.longs)) { dfprintk(MOUNT, " cleanup mode enabled\n"); set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags); } + if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) { + dfprintk(MOUNT, " pnfs cleanup mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, + &clp->cl_sp4_flags); + } + if (test_bit(OP_SECINFO, sp->allow.u.longs) && test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) { dfprintk(MOUNT, " secinfo mode enabled\n"); @@ -7748,6 +7764,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); struct nfs4_session *session = nfs4_get_session(server); + int ret; dprintk("--> %s\n", __func__); /* Note the is a race here, where a CB_LAYOUTRECALL can come in @@ -7758,12 +7775,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) if (nfs41_setup_sequence(session, &lgp->args.seq_args, &lgp->res.seq_res, task)) return; - if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, + ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid, NFS_I(lgp->args.inode)->layout, &lgp->args.range, - lgp->args.ctx->state)) { - rpc_exit(task, NFS4_OK); - } + lgp->args.ctx->state); + if (ret < 0) + rpc_exit(task, ret); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) @@ -7783,6 +7800,15 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: goto out; + + /* + * NFS4ERR_LAYOUTUNAVAILABLE means we are not supposed to use pnfs + * on the file. set tk_status to -ENODATA to tell upper layer to + * retry go inband. + */ + case -NFS4ERR_LAYOUTUNAVAILABLE: + task->tk_status = -ENODATA; + goto out; /* * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). @@ -7979,6 +8005,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) trace_nfs4_layoutget(lgp->args.ctx, &lgp->args.range, &lgp->res.range, + &lgp->res.stateid, status); /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) @@ -8035,11 +8062,11 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); + pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); + pnfs_mark_layout_returned_if_empty(lo); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); pnfs_clear_layoutreturn_waitbit(lo); - lo->plh_block_lgets--; spin_unlock(&lo->plh_inode->i_lock); pnfs_free_lseg_list(&freeme); pnfs_put_layout_hdr(lrp->args.layout); @@ -8071,6 +8098,10 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) }; int status = 0; + nfs4_state_protect(NFS_SERVER(lrp->args.inode)->nfs_client, + NFS_SP4_MACH_CRED_PNFS_CLEANUP, + &task_setup_data.rpc_client, &msg); + dprintk("--> %s\n", __func__); if (!sync) { lrp->inode = nfs_igrab_and_active(lrp->args.inode); @@ -8086,7 +8117,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) return PTR_ERR(task); if (sync) status = task->tk_status; - trace_nfs4_layoutreturn(lrp->args.inode, status); + trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status); dprintk("<-- %s status=%d\n", __func__, status); rpc_put_task(task); return status; @@ -8234,7 +8265,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) return PTR_ERR(task); if (sync) status = task->tk_status; - trace_nfs4_layoutcommit(data->args.inode, status); + trace_nfs4_layoutcommit(data->args.inode, &data->args.stateid, status); dprintk("%s: status %d\n", __func__, status); rpc_put_task(task); return status; diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 0fbd3ab1be22..8693d77c45ea 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -12,7 +12,7 @@ #include "nfs4idmap.h" #include "callback.h" -static const int nfs_set_port_min = 0; +static const int nfs_set_port_min; static const int nfs_set_port_max = 65535; static struct ctl_table_header *nfs4_callback_sysctl_table; diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c index d774335cc8bc..2850bce19244 100644 --- a/fs/nfs/nfs4trace.c +++ b/fs/nfs/nfs4trace.c @@ -6,6 +6,7 @@ #include "internal.h" #include "nfs4session.h" #include "callback.h" +#include "pnfs.h" #define CREATE_TRACE_POINTS #include "nfs4trace.h" diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 671cf68fe56b..2c8d05dae5b1 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -321,6 +321,7 @@ TRACE_EVENT(nfs4_sequence_done, __entry->highest_slotid = res->sr_highest_slotid; __entry->target_highest_slotid = res->sr_target_highest_slotid; + __entry->status_flags = res->sr_status_flags; __entry->error = res->sr_status; ), TP_printk( @@ -399,6 +400,10 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __field(u64, fileid) __field(u64, dir) __string(name, ctx->dentry->d_name.name) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(int, openstateid_seq) + __field(u32, openstateid_hash) ), TP_fast_assign( @@ -409,8 +414,22 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->flags = flags; __entry->fmode = (__force unsigned int)ctx->mode; __entry->dev = ctx->dentry->d_sb->s_dev; - if (!IS_ERR_OR_NULL(state)) + if (!IS_ERR_OR_NULL(state)) { inode = state->inode; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + __entry->openstateid_seq = + be32_to_cpu(state->open_stateid.seqid); + __entry->openstateid_hash = + nfs_stateid_hash(&state->open_stateid); + } else { + __entry->stateid_seq = 0; + __entry->stateid_hash = 0; + __entry->openstateid_seq = 0; + __entry->openstateid_hash = 0; + } if (inode != NULL) { __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); @@ -425,7 +444,8 @@ DECLARE_EVENT_CLASS(nfs4_open_event, TP_printk( "error=%d (%s) flags=%d (%s) fmode=%s " "fileid=%02x:%02x:%llu fhandle=0x%08x " - "name=%02x:%02x:%llu/%s", + "name=%02x:%02x:%llu/%s stateid=%d:0x%08x " + "openstateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), __entry->flags, @@ -436,7 +456,9 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->fhandle, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, - __get_str(name) + __get_str(name), + __entry->stateid_seq, __entry->stateid_hash, + __entry->openstateid_seq, __entry->openstateid_hash ) ); @@ -452,6 +474,45 @@ DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim); DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired); DEFINE_NFS4_OPEN_EVENT(nfs4_open_file); +TRACE_EVENT(nfs4_cached_open, + TP_PROTO( + const struct nfs4_state *state + ), + TP_ARGS(state), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned int, fmode) + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->fmode = (__force unsigned int)state->state; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + ), + + TP_printk( + "fmode=%s fileid=%02x:%02x:%llu " + "fhandle=0x%08x stateid=%d:0x%08x", + __entry->fmode ? show_fmode_flags(__entry->fmode) : + "closed", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash + ) +); + TRACE_EVENT(nfs4_close, TP_PROTO( const struct nfs4_state *state, @@ -468,6 +529,8 @@ TRACE_EVENT(nfs4_close, __field(u64, fileid) __field(unsigned int, fmode) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( @@ -478,18 +541,23 @@ TRACE_EVENT(nfs4_close, __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->fmode = (__force unsigned int)state->state; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(args->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&args->stateid); ), TP_printk( "error=%d (%s) fmode=%s fileid=%02x:%02x:%llu " - "fhandle=0x%08x", + "fhandle=0x%08x openstateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), __entry->fmode ? show_fmode_flags(__entry->fmode) : "closed", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -523,6 +591,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( @@ -536,11 +606,16 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( "error=%d (%s) cmd=%s:%s range=%lld:%lld " - "fileid=%02x:%02x:%llu fhandle=0x%08x", + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), show_lock_cmd(__entry->cmd), @@ -549,7 +624,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, (long long)__entry->end, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -563,11 +639,73 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, ), \ TP_ARGS(request, state, cmd, error)) DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock); -DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock); -DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim); -DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired); DEFINE_NFS4_LOCK_EVENT(nfs4_unlock); +TRACE_EVENT(nfs4_set_lock, + TP_PROTO( + const struct file_lock *request, + const struct nfs4_state *state, + const nfs4_stateid *lockstateid, + int cmd, + int error + ), + + TP_ARGS(request, state, lockstateid, cmd, error), + + TP_STRUCT__entry( + __field(int, error) + __field(int, cmd) + __field(char, type) + __field(loff_t, start) + __field(loff_t, end) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(int, lockstateid_seq) + __field(u32, lockstateid_hash) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->error = error; + __entry->cmd = cmd; + __entry->type = request->fl_type; + __entry->start = request->fl_start; + __entry->end = request->fl_end; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + __entry->lockstateid_seq = + be32_to_cpu(lockstateid->seqid); + __entry->lockstateid_hash = + nfs_stateid_hash(lockstateid); + ), + + TP_printk( + "error=%d (%s) cmd=%s:%s range=%lld:%lld " + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x lockstateid=%d:0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + show_lock_cmd(__entry->cmd), + show_lock_type(__entry->type), + (long long)__entry->start, + (long long)__entry->end, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + __entry->lockstateid_seq, __entry->lockstateid_hash + ) +); + DECLARE_EVENT_CLASS(nfs4_set_delegation_event, TP_PROTO( const struct inode *inode, @@ -621,20 +759,28 @@ TRACE_EVENT(nfs4_delegreturn_exit, __field(dev_t, dev) __field(u32, fhandle) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( __entry->dev = res->server->s_dev; __entry->fhandle = nfs_fhandle_hash(args->fhandle); __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(args->stateid->seqid); + __entry->stateid_hash = + nfs_stateid_hash(args->stateid); ), TP_printk( - "error=%d (%s) dev=%02x:%02x fhandle=0x%08x", + "error=%d (%s) dev=%02x:%02x fhandle=0x%08x " + "stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -653,6 +799,8 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( @@ -662,15 +810,21 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -820,7 +974,6 @@ DECLARE_EVENT_CLASS(nfs4_inode_event, ), \ TP_ARGS(inode, error)) -DEFINE_NFS4_INODE_EVENT(nfs4_setattr); DEFINE_NFS4_INODE_EVENT(nfs4_access); DEFINE_NFS4_INODE_EVENT(nfs4_readlink); DEFINE_NFS4_INODE_EVENT(nfs4_readdir); @@ -830,8 +983,59 @@ DEFINE_NFS4_INODE_EVENT(nfs4_set_acl); DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label); DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label); #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ -DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation); -DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn); + +DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, + TP_PROTO( + const struct inode *inode, + const nfs4_stateid *stateid, + int error + ), + + TP_ARGS(inode, stateid, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(stateid->seqid); + __entry->stateid_hash = + nfs_stateid_hash(stateid); + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash + ) +); + +#define DEFINE_NFS4_INODE_STATEID_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_stateid_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + const nfs4_stateid *stateid, \ + int error \ + ), \ + TP_ARGS(inode, stateid, error)) + +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn); DECLARE_EVENT_CLASS(nfs4_getattr_event, TP_PROTO( @@ -941,8 +1145,74 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, ), \ TP_ARGS(clp, fhandle, inode, error)) DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr); -DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode); +DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, + TP_PROTO( + const struct nfs_client *clp, + const struct nfs_fh *fhandle, + const struct inode *inode, + const nfs4_stateid *stateid, + int error + ), + + TP_ARGS(clp, fhandle, inode, stateid, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __string(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + __entry->error = error; + __entry->fhandle = nfs_fhandle_hash(fhandle); + if (inode != NULL) { + __entry->fileid = NFS_FILEID(inode); + __entry->dev = inode->i_sb->s_dev; + } else { + __entry->fileid = 0; + __entry->dev = 0; + } + __assign_str(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + __entry->stateid_seq = + be32_to_cpu(stateid->seqid); + __entry->stateid_hash = + nfs_stateid_hash(stateid); + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x dstaddr=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + __get_str(dstaddr) + ) +); + +#define DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_stateid_callback_event, name, \ + TP_PROTO( \ + const struct nfs_client *clp, \ + const struct nfs_fh *fhandle, \ + const struct inode *inode, \ + const nfs4_stateid *stateid, \ + int error \ + ), \ + TP_ARGS(clp, fhandle, inode, stateid, error)) +DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall); +DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file); DECLARE_EVENT_CLASS(nfs4_idmap_event, TP_PROTO( @@ -1005,28 +1275,37 @@ DECLARE_EVENT_CLASS(nfs4_read_event, __field(loff_t, offset) __field(size_t, count) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( const struct inode *inode = hdr->inode; + const struct nfs4_state *state = + hdr->args.context->state; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zu", + "offset=%lld count=%zu stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, - __entry->count + __entry->count, + __entry->stateid_seq, __entry->stateid_hash ) ); #define DEFINE_NFS4_READ_EVENT(name) \ @@ -1056,28 +1335,37 @@ DECLARE_EVENT_CLASS(nfs4_write_event, __field(loff_t, offset) __field(size_t, count) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( const struct inode *inode = hdr->inode; + const struct nfs4_state *state = + hdr->args.context->state; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zu", + "offset=%lld count=%zu stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, - __entry->count + __entry->count, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -1154,10 +1442,11 @@ TRACE_EVENT(nfs4_layoutget, const struct nfs_open_context *ctx, const struct pnfs_layout_range *args, const struct pnfs_layout_range *res, + const nfs4_stateid *layout_stateid, int error ), - TP_ARGS(ctx, args, res, error), + TP_ARGS(ctx, args, res, layout_stateid, error), TP_STRUCT__entry( __field(dev_t, dev) @@ -1167,10 +1456,15 @@ TRACE_EVENT(nfs4_layoutget, __field(u64, offset) __field(u64, count) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) ), TP_fast_assign( const struct inode *inode = d_inode(ctx->dentry); + const struct nfs4_state *state = ctx->state; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); @@ -1178,11 +1472,25 @@ TRACE_EVENT(nfs4_layoutget, __entry->offset = args->offset; __entry->count = args->length; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + if (!error) { + __entry->layoutstateid_seq = + be32_to_cpu(layout_stateid->seqid); + __entry->layoutstateid_hash = + nfs_stateid_hash(layout_stateid); + } else { + __entry->layoutstateid_seq = 0; + __entry->layoutstateid_hash = 0; + } ), TP_printk( "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "iomode=%s offset=%llu count=%llu", + "iomode=%s offset=%llu count=%llu stateid=%d:0x%08x " + "layoutstateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), @@ -1190,14 +1498,83 @@ TRACE_EVENT(nfs4_layoutget, __entry->fhandle, show_pnfs_iomode(__entry->iomode), (unsigned long long)__entry->offset, - (unsigned long long)__entry->count + (unsigned long long)__entry->count, + __entry->stateid_seq, __entry->stateid_hash, + __entry->layoutstateid_seq, __entry->layoutstateid_hash ) ); -DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit); -DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn); DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); +#define show_pnfs_update_layout_reason(reason) \ + __print_symbolic(reason, \ + { PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" }, \ + { PNFS_UPDATE_LAYOUT_NO_PNFS, "no pnfs" }, \ + { PNFS_UPDATE_LAYOUT_RD_ZEROLEN, "read+zerolen" }, \ + { PNFS_UPDATE_LAYOUT_MDSTHRESH, "mdsthresh" }, \ + { PNFS_UPDATE_LAYOUT_NOMEM, "nomem" }, \ + { PNFS_UPDATE_LAYOUT_BULK_RECALL, "bulk recall" }, \ + { PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, "io test fail" }, \ + { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \ + { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \ + { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ + { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) + +TRACE_EVENT(pnfs_update_layout, + TP_PROTO(struct inode *inode, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + struct pnfs_layout_hdr *lo, + enum pnfs_update_layout_reason reason + ), + TP_ARGS(inode, pos, count, iomode, lo, reason), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, pos) + __field(u64, count) + __field(enum pnfs_iomode, iomode) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) + __field(enum pnfs_update_layout_reason, reason) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->pos = pos; + __entry->count = count; + __entry->iomode = iomode; + __entry->reason = reason; + if (lo != NULL) { + __entry->layoutstateid_seq = + be32_to_cpu(lo->plh_stateid.seqid); + __entry->layoutstateid_hash = + nfs_stateid_hash(&lo->plh_stateid); + } else { + __entry->layoutstateid_seq = 0; + __entry->layoutstateid_hash = 0; + } + ), + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "iomode=%s pos=%llu count=%llu " + "layoutstateid=%d:0x%08x (%s)", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + show_pnfs_iomode(__entry->iomode), + (unsigned long long)__entry->pos, + (unsigned long long)__entry->count, + __entry->layoutstateid_seq, __entry->layoutstateid_hash, + show_pnfs_update_layout_reason(__entry->reason) + ) +); + #endif /* CONFIG_NFS_V4_1 */ #endif /* _TRACE_NFS4_H */ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 59f838cdc009..9f80a086b612 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -39,7 +39,6 @@ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ - { 1 << NFS_INO_COMMIT, "COMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 452a011ba0d8..8ce4f61cbaa5 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -101,53 +101,18 @@ nfs_page_free(struct nfs_page *p) kmem_cache_free(nfs_page_cachep, p); } -static void -nfs_iocounter_inc(struct nfs_io_counter *c) -{ - atomic_inc(&c->io_count); -} - -static void -nfs_iocounter_dec(struct nfs_io_counter *c) -{ - if (atomic_dec_and_test(&c->io_count)) { - clear_bit(NFS_IO_INPROGRESS, &c->flags); - smp_mb__after_atomic(); - wake_up_bit(&c->flags, NFS_IO_INPROGRESS); - } -} - -static int -__nfs_iocounter_wait(struct nfs_io_counter *c) -{ - wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS); - DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS); - int ret = 0; - - do { - prepare_to_wait(wq, &q.wait, TASK_KILLABLE); - set_bit(NFS_IO_INPROGRESS, &c->flags); - if (atomic_read(&c->io_count) == 0) - break; - ret = nfs_wait_bit_killable(&q.key, TASK_KILLABLE); - } while (atomic_read(&c->io_count) != 0 && !ret); - finish_wait(wq, &q.wait); - return ret; -} - /** * nfs_iocounter_wait - wait for i/o to complete - * @c: nfs_io_counter to use + * @l_ctx: nfs_lock_context with io_counter to use * * returns -ERESTARTSYS if interrupted by a fatal signal. * Otherwise returns 0 once the io_count hits 0. */ int -nfs_iocounter_wait(struct nfs_io_counter *c) +nfs_iocounter_wait(struct nfs_lock_context *l_ctx) { - if (atomic_read(&c->io_count) == 0) - return 0; - return __nfs_iocounter_wait(c); + return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable, + TASK_KILLABLE); } /* @@ -370,7 +335,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, return ERR_CAST(l_ctx); } req->wb_lock_context = l_ctx; - nfs_iocounter_inc(&l_ctx->io_count); + atomic_inc(&l_ctx->io_count); /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in @@ -431,7 +396,8 @@ static void nfs_clear_request(struct nfs_page *req) req->wb_page = NULL; } if (l_ctx != NULL) { - nfs_iocounter_dec(&l_ctx->io_count); + if (atomic_dec_and_test(&l_ctx->io_count)) + wake_up_atomic_t(&l_ctx->io_count); nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } @@ -664,22 +630,11 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio); * @desc: IO descriptor * @hdr: pageio header */ -static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) +static void nfs_pgio_error(struct nfs_pgio_header *hdr) { - struct nfs_pgio_mirror *mirror; - u32 midx; - set_bit(NFS_IOHDR_REDO, &hdr->flags); nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); - /* TODO: Make sure it's right to clean up all mirrors here - * and not just hdr->pgio_mirror_idx */ - for (midx = 0; midx < desc->pg_mirror_count; midx++) { - mirror = &desc->pg_mirrors[midx]; - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - } - return -ENOMEM; } /** @@ -800,8 +755,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, unsigned int pagecount, pageused; pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); - if (!nfs_pgarray_set(&hdr->page_array, pagecount)) - return nfs_pgio_error(desc, hdr); + if (!nfs_pgarray_set(&hdr->page_array, pagecount)) { + nfs_pgio_error(hdr); + desc->pg_error = -ENOMEM; + return desc->pg_error; + } nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); pages = hdr->page_array.pagevec; @@ -819,8 +777,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, *pages++ = last_page = req->wb_page; } } - if (WARN_ON_ONCE(pageused != pagecount)) - return nfs_pgio_error(desc, hdr); + if (WARN_ON_ONCE(pageused != pagecount)) { + nfs_pgio_error(hdr); + desc->pg_error = -EINVAL; + return desc->pg_error; + } if ((desc->pg_ioflags & FLUSH_COND_STABLE) && (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) @@ -835,18 +796,13 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio); static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) { - struct nfs_pgio_mirror *mirror; struct nfs_pgio_header *hdr; int ret; - mirror = nfs_pgio_current_mirror(desc); - hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - /* TODO: make sure this is right with mirroring - or - * should it back out all mirrors? */ - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - return -ENOMEM; + desc->pg_error = -ENOMEM; + return desc->pg_error; } nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); ret = nfs_generic_pgio(desc, hdr); @@ -874,6 +830,9 @@ static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); + if (pgio->pg_error < 0) + return pgio->pg_error; + if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) return -EINVAL; @@ -903,12 +862,6 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) pgio->pg_mirrors_dynamic = NULL; } -static bool nfs_match_open_context(const struct nfs_open_context *ctx1, - const struct nfs_open_context *ctx2) -{ - return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; -} - static bool nfs_match_lock_context(const struct nfs_lock_context *l1, const struct nfs_lock_context *l2) { @@ -982,6 +935,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, } else { if (desc->pg_ops->pg_init) desc->pg_ops->pg_init(desc, req); + if (desc->pg_error < 0) + return 0; mirror->pg_base = req->wb_pgbase; } if (!nfs_can_coalesce_requests(prev, req, desc)) @@ -1147,6 +1102,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, bytes = req->wb_bytes; nfs_pageio_setup_mirroring(desc, req); + if (desc->pg_error < 0) + goto out_failed; for (midx = 0; midx < desc->pg_mirror_count; midx++) { if (midx) { @@ -1163,7 +1120,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (IS_ERR(dupreq)) { nfs_page_group_unlock(req); - return 0; + desc->pg_error = PTR_ERR(dupreq); + goto out_failed; } nfs_lock_request(dupreq); @@ -1176,10 +1134,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (nfs_pgio_has_mirroring(desc)) desc->pg_mirror_idx = midx; if (!nfs_pageio_add_request_mirror(desc, dupreq)) - return 0; + goto out_failed; } return 1; + +out_failed: + /* + * We might have failed before sending any reqs over wire. + * Clean up rest of the reqs in mirror pg_list. + */ + if (desc->pg_error) { + struct nfs_pgio_mirror *mirror; + void (*func)(struct list_head *); + + /* remember fatal errors */ + if (nfs_error_is_fatal(desc->pg_error)) + mapping_set_error(desc->pg_inode->i_mapping, + desc->pg_error); + + func = desc->pg_completion_ops->error_cleanup; + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + mirror = &desc->pg_mirrors[midx]; + func(&mirror->pg_list); + } + } + return 0; } /* @@ -1232,7 +1212,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, nfs_pageio_complete(desc); if (!list_empty(&failed)) { list_move(&failed, &hdr->pages); - return -EIO; + return desc->pg_error < 0 ? desc->pg_error : -EIO; } return 0; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bec0384499f7..a3592cc34a20 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock); static LIST_HEAD(pnfs_modules_tbl); static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, enum pnfs_iomode iomode, bool sync); /* Return the registered pnfs layout driver module matching given id */ @@ -385,13 +385,13 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, enum pnfs_iomode iomode; bool send; - stateid = lo->plh_stateid; + nfs4_stateid_copy(&stateid, &lo->plh_stateid); iomode = lo->plh_return_iomode; send = pnfs_prepare_layoutreturn(lo); spin_unlock(&inode->i_lock); if (send) { /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, stateid, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, iomode, false); } } else spin_unlock(&inode->i_lock); @@ -566,10 +566,10 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *recall_range) + const struct pnfs_layout_range *recall_range) { struct pnfs_layout_segment *lseg, *next; - int invalid = 0, removed = 0; + int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -582,11 +582,11 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); - invalid++; - removed += mark_lseg_invalid(lseg, tmp_list); + if (!mark_lseg_invalid(lseg, tmp_list)) + remaining++; } - dprintk("%s:Return %i\n", __func__, invalid - removed); - return invalid - removed; + dprintk("%s:Return %i\n", __func__, remaining); + return remaining; } /* note free_me must contain lsegs from a single layout_hdr */ @@ -618,7 +618,6 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) pnfs_get_layout_hdr(lo); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); - pnfs_clear_retry_layoutget(lo); spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); @@ -703,6 +702,8 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, ret = -EAGAIN; spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&lseg_list); + /* Free all lsegs that are attached to commit buckets */ + nfs_commit_inode(inode, 0); pnfs_put_layout_hdr(lo); iput(inode); } @@ -826,7 +827,7 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, struct nfs4_state *open_state) { int status = 0; @@ -861,7 +862,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, gfp_t gfp_flags) { struct inode *ino = lo->plh_inode; @@ -894,7 +895,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.minlength = i_size - range->offset; } lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; - lgp->args.range = *range; + pnfs_copy_range(&lgp->args.range, range); lgp->args.type = server->pnfs_curr_ld->id; lgp->args.inode = ino; lgp->args.ctx = get_nfs_open_context(ctx); @@ -904,17 +905,9 @@ send_layoutget(struct pnfs_layout_hdr *lo, lseg = nfs4_proc_layoutget(lgp, gfp_flags); } while (lseg == ERR_PTR(-EAGAIN)); - if (IS_ERR(lseg)) { - switch (PTR_ERR(lseg)) { - case -ENOMEM: - case -ERESTARTSYS: - break; - default: - /* remember that LAYOUTGET failed and suspend trying */ - pnfs_layout_io_set_failed(lo, range->iomode); - } - return NULL; - } else + if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg))) + lseg = NULL; + else pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(range->iomode)); @@ -945,7 +938,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) } static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, enum pnfs_iomode iomode, bool sync) { struct inode *ino = lo->plh_inode; @@ -962,7 +955,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, goto out; } - lrp->args.stateid = stateid; + nfs4_stateid_copy(&lrp->args.stateid, stateid); lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; lrp->args.inode = ino; lrp->args.range.iomode = iomode; @@ -1005,7 +998,7 @@ _pnfs_return_layout(struct inode *ino) dprintk("NFS: %s no layout to return\n", __func__); goto out; } - stateid = nfsi->layout->plh_stateid; + nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid); /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); @@ -1033,7 +1026,7 @@ _pnfs_return_layout(struct inode *ino) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); if (send) - status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); out_put_layout_hdr: pnfs_put_layout_hdr(lo); out: @@ -1096,13 +1089,12 @@ bool pnfs_roc(struct inode *ino) goto out_noroc; } - stateid = lo->plh_stateid; + nfs4_stateid_copy(&stateid, &lo->plh_stateid); /* always send layoutreturn if being marked so */ if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags)) layoutreturn = pnfs_prepare_layoutreturn(lo); - pnfs_clear_retry_layoutget(lo); list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) /* If we are sending layoutreturn, invalidate all valid lsegs */ if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { @@ -1124,7 +1116,7 @@ out_noroc: pnfs_free_lseg_list(&tmp_list); pnfs_layoutcommit_inode(ino, true); if (layoutreturn) - pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); return roc; } @@ -1149,6 +1141,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; + pnfs_mark_layout_returned_if_empty(lo); if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); @@ -1465,25 +1458,15 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, return ret; } -/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */ -static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key, int mode) -{ - if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags)) - return 1; - return nfs_wait_bit_killable(key, mode); -} - static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) { - if (!pnfs_should_retry_layoutget(lo)) - return false; /* * send layoutcommit as it can hold up layoutreturn due to lseg * reference */ pnfs_layoutcommit_inode(lo->plh_inode, false); return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, - pnfs_layoutget_retry_bit_wait, + nfs_wait_bit_killable, TASK_UNINTERRUPTIBLE); } @@ -1520,14 +1503,23 @@ pnfs_update_layout(struct inode *ino, struct pnfs_layout_segment *lseg = NULL; bool first; - if (!pnfs_enabled_sb(NFS_SERVER(ino))) + if (!pnfs_enabled_sb(NFS_SERVER(ino))) { + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_NO_PNFS); goto out; + } - if (iomode == IOMODE_READ && i_size_read(ino) == 0) + if (iomode == IOMODE_READ && i_size_read(ino) == 0) { + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_RD_ZEROLEN); goto out; + } - if (pnfs_within_mdsthreshold(ctx, ino, iomode)) + if (pnfs_within_mdsthreshold(ctx, ino, iomode)) { + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_MDSTHRESH); goto out; + } lookup_again: first = false; @@ -1535,19 +1527,25 @@ lookup_again: lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_NOMEM); goto out; } /* Do we even need to bother with this? */ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_BULK_RECALL); dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; } /* if LAYOUTGET already failed once we don't try again */ - if (pnfs_layout_io_test_failed(lo, iomode) && - !pnfs_should_retry_layoutget(lo)) + if (pnfs_layout_io_test_failed(lo, iomode)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_IO_TEST_FAIL); goto out_unlock; + } first = list_empty(&lo->plh_segs); if (first) { @@ -1567,8 +1565,11 @@ lookup_again: * already exists */ lseg = pnfs_find_lseg(lo, &arg); - if (lseg) + if (lseg) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_FOUND_CACHED); goto out_unlock; + } } /* @@ -1585,11 +1586,16 @@ lookup_again: dprintk("%s retrying\n", __func__); goto lookup_again; } + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_RETURN); goto out_put_layout_hdr; } - if (pnfs_layoutgets_blocked(lo)) + if (pnfs_layoutgets_blocked(lo)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_BLOCKED); goto out_unlock; + } atomic_inc(&lo->plh_outstanding); spin_unlock(&ino->i_lock); @@ -1612,8 +1618,9 @@ lookup_again: arg.length = PAGE_CACHE_ALIGN(arg.length); lseg = send_layoutget(lo, ctx, &arg, gfp_flags); - pnfs_clear_retry_layoutget(lo); atomic_dec(&lo->plh_outstanding); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); @@ -1623,7 +1630,7 @@ out: "(%s, offset: %llu, length: %llu)\n", __func__, ino->i_sb->s_id, (unsigned long long)NFS_FILEID(ino), - lseg == NULL ? "not found" : "found", + IS_ERR_OR_NULL(lseg) ? "not found" : "found", iomode==IOMODE_RW ? "read/write" : "read-only", (unsigned long long)pos, (unsigned long long)count); @@ -1730,16 +1737,29 @@ out_forget_reply: } static void +pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) +{ + if (lo->plh_return_iomode == iomode) + return; + if (lo->plh_return_iomode != 0) + iomode = IOMODE_ANY; + lo->plh_return_iomode = iomode; +} + +int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *return_range) + const struct pnfs_layout_range *return_range) { struct pnfs_layout_segment *lseg, *next; + int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); if (list_empty(&lo->plh_segs)) - return; + return 0; + + assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (should_free_lseg(&lseg->pls_range, return_range)) { @@ -1749,38 +1769,47 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg->pls_range.offset, lseg->pls_range.length); set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); - mark_lseg_invalid(lseg, tmp_list); + pnfs_set_plh_return_iomode(lo, return_range->iomode); + if (!mark_lseg_invalid(lseg, tmp_list)) + remaining++; set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); } + return remaining; } void pnfs_error_mark_layout_for_return(struct inode *inode, struct pnfs_layout_segment *lseg) { struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; - int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode); struct pnfs_layout_range range = { .iomode = lseg->pls_range.iomode, .offset = 0, .length = NFS4_MAX_UINT64, }; LIST_HEAD(free_me); + bool return_now = false; spin_lock(&inode->i_lock); - /* set failure bit so that pnfs path will be retried later */ - pnfs_layout_set_fail_bit(lo, iomode); - if (lo->plh_return_iomode == 0) - lo->plh_return_iomode = range.iomode; - else if (lo->plh_return_iomode != range.iomode) - lo->plh_return_iomode = IOMODE_ANY; + pnfs_set_plh_return_iomode(lo, range.iomode); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - pnfs_mark_matching_lsegs_return(lo, &free_me, &range); - spin_unlock(&inode->i_lock); + if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) { + nfs4_stateid stateid; + enum pnfs_iomode iomode = lo->plh_return_iomode; + + nfs4_stateid_copy(&stateid, &lo->plh_stateid); + return_now = pnfs_prepare_layoutreturn(lo); + spin_unlock(&inode->i_lock); + if (return_now) + pnfs_send_layoutreturn(lo, &stateid, iomode, false); + } else { + spin_unlock(&inode->i_lock); + nfs_commit_inode(inode, 0); + } pnfs_free_lseg_list(&free_me); } EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); @@ -1802,6 +1831,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r rd_size, IOMODE_READ, GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } } /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) @@ -1814,13 +1848,19 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { - if (pgio->pg_lseg == NULL) + if (pgio->pg_lseg == NULL) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, req_offset(req), wb_size, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) nfs_pageio_reset_write_mds(pgio); @@ -1988,15 +2028,13 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_pgio_header *hdr; int ret; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - return -ENOMEM; + desc->pg_error = -ENOMEM; + return desc->pg_error; } nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); @@ -2119,15 +2157,13 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_pgio_header *hdr; int ret; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - return -ENOMEM; + desc->pg_error = -ENOMEM; + return desc->pg_error; } nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index d1990e90e7a0..9f4e2a47f4aa 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -98,7 +98,6 @@ enum { NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ - NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */ }; enum layoutdriver_policy_flags { @@ -261,11 +260,14 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, bool update_barrier); int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, struct nfs4_state *open_state); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range); +int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + const struct pnfs_layout_range *recall_range); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); @@ -379,26 +381,6 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d) return d; } -static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo) -{ - if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) - atomic_inc(&lo->plh_refcount); -} - -static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo) -{ - if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) { - atomic_dec(&lo->plh_refcount); - /* wake up waiters for LAYOUTRETURN as that is not needed */ - wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); - } -} - -static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo) -{ - return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags); -} - static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { @@ -409,6 +391,12 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg) return lseg; } +static inline bool +pnfs_is_valid_lseg(struct pnfs_layout_segment *lseg) +{ + return test_bit(NFS_LSEG_VALID, &lseg->pls_flags) != 0; +} + /* Return true if a layout driver is being used for this mountpoint */ static inline int pnfs_enabled_sb(struct nfs_server *nfss) { @@ -556,6 +544,26 @@ pnfs_calc_offset_length(u64 offset, u64 end) return 1 + end - offset; } +/** + * pnfs_mark_layout_returned_if_empty - marks the layout as returned + * @lo: layout header + * + * Note: Caller must hold inode->i_lock + */ +static inline void +pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo) +{ + if (list_empty(&lo->plh_segs)) + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); +} + +static inline void +pnfs_copy_range(struct pnfs_layout_range *dst, + const struct pnfs_layout_range *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 24655b807d44..81ac6480f9e7 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -266,17 +266,14 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, } else { nfs_retry_commit(mds_pages, NULL, cinfo, 0); pnfs_generic_retry_commit(cinfo, 0); - cinfo->completion_ops->error_cleanup(NFS_I(inode)); return -ENOMEM; } } nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); - if (nreq == 0) { - cinfo->completion_ops->error_cleanup(NFS_I(inode)); + if (nreq == 0) goto out; - } atomic_add(nreq, &cinfo->mds->rpcs_out); @@ -871,6 +868,11 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { + if (!pnfs_is_valid_lseg(lseg)) { + spin_unlock(cinfo->lock); + cinfo->completion_ops->resched_write(cinfo, req); + return; + } /* Non-empty buckets hold a reference on the lseg. That ref * is normally transferred to the COMMIT call and released * there. It could also be released if the last req is pulled diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 0a5e33f33b5c..eb31e23e7def 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -85,6 +85,23 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); +static void nfs_readpage_release(struct nfs_page *req) +{ + struct inode *inode = d_inode(req->wb_context->dentry); + + dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode), req->wb_bytes, + (long long)req_offset(req)); + + if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(inode, req->wb_page, 0); + + unlock_page(req->wb_page); + } + nfs_release_request(req); +} + int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page) { @@ -106,7 +123,10 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); - nfs_pageio_add_request(&pgio, new); + if (!nfs_pageio_add_request(&pgio, new)) { + nfs_list_remove_request(new); + nfs_readpage_release(new); + } nfs_pageio_complete(&pgio); /* It doesn't make sense to do mirrored reads! */ @@ -115,24 +135,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, pgm = &pgio.pg_mirrors[0]; NFS_I(inode)->read_io += pgm->pg_bytes_written; - return 0; -} - -static void nfs_readpage_release(struct nfs_page *req) -{ - struct inode *inode = d_inode(req->wb_context->dentry); - - dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), req->wb_bytes, - (long long)req_offset(req)); - - if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { - if (PageUptodate(req->wb_page)) - nfs_readpage_to_fscache(inode, req->wb_page, 0); - - unlock_page(req->wb_page); - } - nfs_release_request(req); + return pgio.pg_error < 0 ? pgio.pg_error : 0; } static void nfs_page_group_set_uptodate(struct nfs_page *req) @@ -361,6 +364,8 @@ readpage_async_filler(void *data, struct page *page) if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); if (!nfs_pageio_add_request(desc->pgio, new)) { + nfs_list_remove_request(new); + nfs_readpage_release(new); error = desc->pgio->pg_error; goto out_unlock; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7b9316406930..ce43cd6d88c6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -21,6 +21,8 @@ #include <linux/nfs_page.h> #include <linux/backing-dev.h> #include <linux/export.h> +#include <linux/freezer.h> +#include <linux/wait.h> #include <asm/uaccess.h> @@ -244,11 +246,9 @@ static int wb_priority(struct writeback_control *wbc) { int ret = 0; if (wbc->for_reclaim) - return FLUSH_HIGHPRI | FLUSH_STABLE; + return FLUSH_HIGHPRI | FLUSH_COND_STABLE; if (wbc->sync_mode == WB_SYNC_ALL) ret = FLUSH_COND_STABLE; - if (wbc->for_kupdate || wbc->for_background) - ret |= FLUSH_LOWPRI; return ret; } @@ -545,12 +545,22 @@ try_again: return head; } +static void nfs_write_error_remove_page(struct nfs_page *req) +{ + nfs_unlock_request(req); + nfs_end_page_writeback(req); + nfs_release_request(req); + generic_error_remove_page(page_file_mapping(req->wb_page), + req->wb_page); +} + /* * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock) + struct page *page, bool nonblock, + bool launder) { struct nfs_page *req; int ret = 0; @@ -567,8 +577,21 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, ret = 0; if (!nfs_pageio_add_request(pgio, req)) { - nfs_redirty_request(req); ret = pgio->pg_error; + /* + * Remove the problematic req upon fatal errors + * in launder case, while other dirty pages can + * still be around until they get flushed. + */ + if (nfs_error_is_fatal(ret)) { + nfs_context_set_write_error(req->wb_context, ret); + if (launder) { + nfs_write_error_remove_page(req); + goto out; + } + } + nfs_redirty_request(req); + ret = -EAGAIN; } else nfs_add_stats(page_file_mapping(page)->host, NFSIOS_WRITEPAGES, 1); @@ -576,12 +599,14 @@ out: return ret; } -static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) +static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio, bool launder) { int ret; nfs_pageio_cond_complete(pgio, page_file_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE, + launder); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -592,7 +617,9 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st /* * Write an mmapped page to the server. */ -static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) +static int nfs_writepage_locked(struct page *page, + struct writeback_control *wbc, + bool launder) { struct nfs_pageio_descriptor pgio; struct inode *inode = page_file_mapping(page)->host; @@ -601,7 +628,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, &nfs_async_write_completion_ops); - err = nfs_do_writepage(page, wbc, &pgio); + err = nfs_do_writepage(page, wbc, &pgio, launder); nfs_pageio_complete(&pgio); if (err < 0) return err; @@ -614,7 +641,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) { int ret; - ret = nfs_writepage_locked(page, wbc); + ret = nfs_writepage_locked(page, wbc, false); unlock_page(page); return ret; } @@ -623,7 +650,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * { int ret; - ret = nfs_do_writepage(page, wbc, data); + ret = nfs_do_writepage(page, wbc, data, false); unlock_page(page); return ret; } @@ -1128,7 +1155,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page) if (req == NULL) return 0; l_ctx = req->wb_lock_context; - do_flush = req->wb_page != page || req->wb_context != ctx; + do_flush = req->wb_page != page || + !nfs_match_open_context(req->wb_context, ctx); /* for now, flush if more than 1 request in page_group */ do_flush |= req->wb_this_page != req; if (l_ctx && flctx && @@ -1326,9 +1354,15 @@ static void nfs_async_write_error(struct list_head *head) } } +static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr) +{ + nfs_async_write_error(&hdr->pages); +} + static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { .error_cleanup = nfs_async_write_error, .completion = nfs_write_completion, + .reschedule_io = nfs_async_write_reschedule_io, }; void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, @@ -1529,27 +1563,21 @@ static void nfs_writeback_result(struct rpc_task *task, } } - -static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) +static int wait_on_commit(struct nfs_mds_commit_info *cinfo) { - int ret; + return wait_on_atomic_t(&cinfo->rpcs_out, + nfs_wait_atomic_killable, TASK_KILLABLE); +} - if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) - return 1; - if (!may_wait) - return 0; - ret = out_of_line_wait_on_bit_lock(&nfsi->flags, - NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); - return (ret < 0) ? ret : 1; +static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) +{ + atomic_inc(&cinfo->rpcs_out); } -static void nfs_commit_clear_lock(struct nfs_inode *nfsi) +static void nfs_commit_end(struct nfs_mds_commit_info *cinfo) { - clear_bit(NFS_INO_COMMIT, &nfsi->flags); - smp_mb__after_atomic(); - wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); + if (atomic_dec_and_test(&cinfo->rpcs_out)) + wake_up_atomic_t(&cinfo->rpcs_out); } void nfs_commitdata_release(struct nfs_commit_data *data) @@ -1666,6 +1694,13 @@ void nfs_retry_commit(struct list_head *page_list, } EXPORT_SYMBOL_GPL(nfs_retry_commit); +static void +nfs_commit_resched_write(struct nfs_commit_info *cinfo, + struct nfs_page *req) +{ + __set_page_dirty_nobuffers(req->wb_page); +} + /* * Commit dirty pages */ @@ -1687,7 +1722,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, data->mds_ops, how, 0); out_bad: nfs_retry_commit(head, NULL, cinfo, 0); - cinfo->completion_ops->error_cleanup(NFS_I(inode)); return -ENOMEM; } @@ -1749,8 +1783,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); nfs_init_cinfo(&cinfo, data->inode, data->dreq); - if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) - nfs_commit_clear_lock(NFS_I(data->inode)); + nfs_commit_end(cinfo.mds); } static void nfs_commit_release(void *calldata) @@ -1769,7 +1802,7 @@ static const struct rpc_call_ops nfs_commit_ops = { static const struct nfs_commit_completion_ops nfs_commit_completion_ops = { .completion = nfs_commit_release_pages, - .error_cleanup = nfs_commit_clear_lock, + .resched_write = nfs_commit_resched_write, }; int nfs_generic_commit_list(struct inode *inode, struct list_head *head, @@ -1788,30 +1821,25 @@ int nfs_commit_inode(struct inode *inode, int how) LIST_HEAD(head); struct nfs_commit_info cinfo; int may_wait = how & FLUSH_SYNC; + int error = 0; int res; - res = nfs_commit_set_lock(NFS_I(inode), may_wait); - if (res <= 0) - goto out_mark_dirty; nfs_init_cinfo_from_inode(&cinfo, inode); + nfs_commit_begin(cinfo.mds); res = nfs_scan_commit(inode, &head, &cinfo); - if (res) { - int error; - + if (res) error = nfs_generic_commit_list(inode, &head, how, &cinfo); - if (error < 0) - return error; - if (!may_wait) - goto out_mark_dirty; - error = wait_on_bit_action(&NFS_I(inode)->flags, - NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); - if (error < 0) - return error; - } else - nfs_commit_clear_lock(NFS_I(inode)); + nfs_commit_end(cinfo.mds); + if (error < 0) + goto out_error; + if (!may_wait) + goto out_mark_dirty; + error = wait_on_commit(cinfo.mds); + if (error < 0) + return error; return res; +out_error: + res = error; /* Note: If we exit without ensuring that the commit is complete, * we must mark the inode as dirty. Otherwise, future calls to * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure @@ -1821,6 +1849,7 @@ out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return res; } +EXPORT_SYMBOL_GPL(nfs_commit_inode); int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { @@ -1911,7 +1940,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) /* * Write back all requests on one page - we do this before reading it. */ -int nfs_wb_page(struct inode *inode, struct page *page) +int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) { loff_t range_start = page_file_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); @@ -1928,7 +1957,7 @@ int nfs_wb_page(struct inode *inode, struct page *page) for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc); + ret = nfs_writepage_locked(page, &wbc, launder); if (ret < 0) goto out_error; continue; diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 77e7a5cca888..1a03bc3059e8 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -58,7 +58,7 @@ nlm_fclose(struct file *filp) fput(filp); } -static struct nlmsvc_binding nfsd_nlm_ops = { +static const struct nlmsvc_binding nfsd_nlm_ops = { .fopen = nlm_fopen, /* open file for locking */ .fclose = nlm_fclose, /* close file */ }; diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index d8b16c2568f3..5fbf3bbd00d0 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -92,7 +92,7 @@ struct nfsd_net { struct file *rec_file; bool in_grace; - struct nfsd4_client_tracking_ops *client_tracking_ops; + const struct nfsd4_client_tracking_ops *client_tracking_ops; time_t nfsd4_lease; time_t nfsd4_grace; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index e7f50c4081d6..7389cb1d7409 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -792,12 +792,16 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason) static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) { + if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) + return; clp->cl_cb_state = NFSD4_CB_DOWN; warn_no_callback_path(clp, reason); } static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) { + if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) + return; clp->cl_cb_state = NFSD4_CB_FAULT; warn_no_callback_path(clp, reason); } @@ -1143,7 +1147,7 @@ nfsd4_run_cb_work(struct work_struct *work) } void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, - struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op) + const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op) { cb->cb_clp = clp; cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op]; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index c9d6c715c0fb..ce2d010d3b17 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -22,7 +22,7 @@ struct nfs4_layout { static struct kmem_cache *nfs4_layout_cache; static struct kmem_cache *nfs4_layout_stateid_cache; -static struct nfsd4_callback_ops nfsd4_cb_layout_ops; +static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; static const struct lock_manager_operations nfsd4_layouts_lm_ops; const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { @@ -624,24 +624,39 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) { struct nfs4_layout_stateid *ls = container_of(cb, struct nfs4_layout_stateid, ls_recall); + struct nfsd_net *nn; + ktime_t now, cutoff; LIST_HEAD(reaplist); + switch (task->tk_status) { case 0: - return 1; + case -NFS4ERR_DELAY: + /* + * Anything left? If not, then call it done. Note that we don't + * take the spinlock since this is an optimization and nothing + * should get added until the cb counter goes to zero. + */ + if (list_empty(&ls->ls_layouts)) + return 1; + + /* Poll the client until it's done with the layout */ + now = ktime_get(); + nn = net_generic(ls->ls_stid.sc_client->net, nfsd_net_id); + + /* Client gets 2 lease periods to return it */ + cutoff = ktime_add_ns(task->tk_start, + nn->nfsd4_lease * NSEC_PER_SEC * 2); + + if (ktime_before(now, cutoff)) { + rpc_delay(task, HZ/100); /* 10 mili-seconds */ + return 0; + } + /* Fallthrough */ case -NFS4ERR_NOMATCHING_LAYOUT: trace_layout_recall_done(&ls->ls_stid.sc_stateid); task->tk_status = 0; return 1; - case -NFS4ERR_DELAY: - /* Poll the client until it's done with the layout */ - /* FIXME: cap number of retries. - * The pnfs standard states that we need to only expire - * the client after at-least "lease time" .eg lease-time * 2 - * when failing to communicate a recall - */ - rpc_delay(task, HZ/100); /* 10 mili-seconds */ - return 0; default: /* * Unknown error or non-responding client, we'll need to fence. @@ -665,7 +680,7 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb) nfs4_put_stid(&ls->ls_stid); } -static struct nfsd4_callback_ops nfsd4_cb_layout_ops = { +static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = { .prepare = nfsd4_cb_layout_prepare, .done = nfsd4_cb_layout_done, .release = nfsd4_cb_layout_release, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index e3d47091b191..79f0307a5ec8 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -631,7 +631,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp) return -ENOENT; } -static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { +static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { .init = nfsd4_legacy_tracking_init, .exit = nfsd4_legacy_tracking_exit, .create = nfsd4_create_clid_dir, @@ -1050,7 +1050,7 @@ out_err: printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret); } -static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { .init = nfsd4_init_cld_pipe, .exit = nfsd4_remove_cld_pipe, .create = nfsd4_cld_create, @@ -1394,7 +1394,7 @@ nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn) kfree(legacy); } -static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { +static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { .init = nfsd4_umh_cltrack_init, .exit = NULL, .create = nfsd4_umh_cltrack_create, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index df5dba687265..c484a2b6cd10 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -98,7 +98,7 @@ static struct kmem_cache *odstate_slab; static void free_session(struct nfsd4_session *); -static struct nfsd4_callback_ops nfsd4_cb_recall_ops; +static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; static bool is_session_dead(struct nfsd4_session *ses) { @@ -1857,15 +1857,28 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) target->cl_clientid.cl_id = source->cl_clientid.cl_id; } -static int copy_cred(struct svc_cred *target, struct svc_cred *source) +int strdup_if_nonnull(char **target, char *source) { - if (source->cr_principal) { - target->cr_principal = - kstrdup(source->cr_principal, GFP_KERNEL); - if (target->cr_principal == NULL) + if (source) { + *target = kstrdup(source, GFP_KERNEL); + if (!*target) return -ENOMEM; } else - target->cr_principal = NULL; + *target = NULL; + return 0; +} + +static int copy_cred(struct svc_cred *target, struct svc_cred *source) +{ + int ret; + + ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal); + if (ret) + return ret; + ret = strdup_if_nonnull(&target->cr_raw_principal, + source->cr_raw_principal); + if (ret) + return ret; target->cr_flavor = source->cr_flavor; target->cr_uid = source->cr_uid; target->cr_gid = source->cr_gid; @@ -1969,6 +1982,9 @@ static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp) return false; if (!svc_rqst_integrity_protected(rqstp)) return false; + if (cl->cl_cred.cr_raw_principal) + return 0 == strcmp(cl->cl_cred.cr_raw_principal, + cr->cr_raw_principal); if (!cr->cr_principal) return false; return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal); @@ -2240,7 +2256,8 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) base = resp->cstate.data_offset; slot->sl_datalen = buf->len - base; if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen)) - WARN("%s: sessions DRC could not cache compound\n", __func__); + WARN(1, "%s: sessions DRC could not cache compound\n", + __func__); return; } @@ -2365,10 +2382,27 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, if (exid->flags & ~EXCHGID4_FLAG_MASK_A) return nfserr_inval; + new = create_client(exid->clname, rqstp, &verf); + if (new == NULL) + return nfserr_jukebox; + switch (exid->spa_how) { case SP4_MACH_CRED: - if (!svc_rqst_integrity_protected(rqstp)) - return nfserr_inval; + if (!svc_rqst_integrity_protected(rqstp)) { + status = nfserr_inval; + goto out_nolock; + } + /* + * Sometimes userspace doesn't give us a principal. + * Which is a bug, really. Anyway, we can't enforce + * MACH_CRED in that case, better to give up now: + */ + if (!new->cl_cred.cr_principal && + !new->cl_cred.cr_raw_principal) { + status = nfserr_serverfault; + goto out_nolock; + } + new->cl_mach_cred = true; case SP4_NONE: break; default: /* checked by xdr code */ @@ -2377,10 +2411,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, return nfserr_encr_alg_unsupp; } - new = create_client(exid->clname, rqstp, &verf); - if (new == NULL) - return nfserr_jukebox; - /* Cases below refer to rfc 5661 section 18.35.4: */ spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&exid->clname, nn); @@ -2442,7 +2472,6 @@ out_new: goto out; } new->cl_minorversion = cstate->minorversion; - new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED); gen_clid(new, nn); add_to_unconfirmed(new); @@ -2460,6 +2489,7 @@ out_copy: out: spin_unlock(&nn->client_lock); +out_nolock: if (new) expire_client(new); if (unconf) @@ -3648,7 +3678,7 @@ static void nfsd4_cb_recall_release(struct nfsd4_callback *cb) nfs4_put_stid(&dp->dl_stid); } -static struct nfsd4_callback_ops nfsd4_cb_recall_ops = { +static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = { .prepare = nfsd4_cb_recall_prepare, .done = nfsd4_cb_recall_done, .release = nfsd4_cb_recall_release, @@ -4541,8 +4571,7 @@ static void laundromat_main(struct work_struct *laundry) { time_t t; - struct delayed_work *dwork = container_of(laundry, struct delayed_work, - work); + struct delayed_work *dwork = to_delayed_work(laundry); struct nfsd_net *nn = container_of(dwork, struct nfsd_net, laundromat_work); diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 2087bae17582..0770bcb543c8 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -7,6 +7,7 @@ #ifndef _LINUX_NFSD_NFSFH_H #define _LINUX_NFSD_NFSFH_H +#include <linux/crc32.h> #include <linux/sunrpc/svc.h> #include <uapi/linux/nfsd/nfsfh.h> @@ -205,6 +206,28 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) return true; } +#ifdef CONFIG_CRC32 +/** + * knfsd_fh_hash - calculate the crc32 hash for the filehandle + * @fh - pointer to filehandle + * + * returns a crc32 hash for the filehandle that is compatible with + * the one displayed by "wireshark". + */ + +static inline u32 +knfsd_fh_hash(struct knfsd_fh *fh) +{ + return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size); +} +#else +static inline u32 +knfsd_fh_hash(struct knfsd_fh *fh) +{ + return 0; +} +#endif + #ifdef CONFIG_NFSD_V3 /* * The wcc data stored in current_fh should be cleared diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ad4e2377dd63..45007acaf364 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -14,9 +14,13 @@ #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> +#include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/bind.h> #include <linux/nfsacl.h> #include <linux/seq_file.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> +#include <net/ipv6.h> #include <net/net_namespace.h> #include "nfsd.h" #include "cache.h" @@ -306,22 +310,81 @@ static void nfsd_shutdown_net(struct net *net) nfsd_shutdown_generic(); } +static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct net_device *dev = ifa->ifa_dev->dev; + struct net *net = dev_net(dev); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct sockaddr_in sin; + + if (event != NETDEV_DOWN) + goto out; + + if (nn->nfsd_serv) { + dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ifa->ifa_local; + svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block nfsd_inetaddr_notifier = { + .notifier_call = nfsd_inetaddr_event, +}; + +#if IS_ENABLED(CONFIG_IPV6) +static int nfsd_inet6addr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; + struct net_device *dev = ifa->idev->dev; + struct net *net = dev_net(dev); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct sockaddr_in6 sin6; + + if (event != NETDEV_DOWN) + goto out; + + if (nn->nfsd_serv) { + dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = ifa->addr; + svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block nfsd_inet6addr_notifier = { + .notifier_call = nfsd_inet6addr_event, +}; +#endif + static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); +#endif /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are * started, then nfsd_last_thread will be run before any of this - * other initialization has been done. + * other initialization has been done except the rpcb information. */ + svc_rpcb_cleanup(serv, net); if (!nn->nfsd_net_up) return; - nfsd_shutdown_net(net); - - svc_rpcb_cleanup(serv, net); + nfsd_shutdown_net(net); printk(KERN_WARNING "nfsd: last server has exited, flushing export " "cache\n"); nfsd_export_flush(net); @@ -425,6 +488,10 @@ int nfsd_create_serv(struct net *net) } set_max_drc(); + register_inetaddr_notifier(&nfsd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + register_inet6addr_notifier(&nfsd_inet6addr_notifier); +#endif do_gettimeofday(&nn->nfssvc_boot); /* record boot time */ return 0; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 99432b7ecb9c..c050c53036a6 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -65,7 +65,7 @@ struct nfsd4_callback { struct nfs4_client *cb_clp; u32 cb_minorversion; struct rpc_message cb_msg; - struct nfsd4_callback_ops *cb_ops; + const struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; int cb_seq_status; int cb_status; @@ -599,7 +599,7 @@ extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, - struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); + const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); extern void nfsd4_run_cb(struct nfsd4_callback *cb); extern int nfsd4_create_callback_queue(void); extern void nfsd4_destroy_callback_queue(void); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 0befe762762b..3287041905da 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -8,6 +8,47 @@ #define _NFSD_TRACE_H #include <linux/tracepoint.h> +#include "nfsfh.h" + +DECLARE_EVENT_CLASS(nfsd_io_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *fhp, + loff_t offset, + int len), + TP_ARGS(rqstp, fhp, offset, len), + TP_STRUCT__entry( + __field(__be32, xid) + __field_struct(struct knfsd_fh, fh) + __field(loff_t, offset) + __field(int, len) + ), + TP_fast_assign( + __entry->xid = rqstp->rq_xid, + fh_copy_shallow(&__entry->fh, &fhp->fh_handle); + __entry->offset = offset; + __entry->len = len; + ), + TP_printk("xid=0x%x fh=0x%x offset=%lld len=%d", + __be32_to_cpu(__entry->xid), knfsd_fh_hash(&__entry->fh), + __entry->offset, __entry->len) +) + +#define DEFINE_NFSD_IO_EVENT(name) \ +DEFINE_EVENT(nfsd_io_class, name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *fhp, \ + loff_t offset, \ + int len), \ + TP_ARGS(rqstp, fhp, offset, len)) + +DEFINE_NFSD_IO_EVENT(read_start); +DEFINE_NFSD_IO_EVENT(read_opened); +DEFINE_NFSD_IO_EVENT(read_io_done); +DEFINE_NFSD_IO_EVENT(read_done); +DEFINE_NFSD_IO_EVENT(write_start); +DEFINE_NFSD_IO_EVENT(write_opened); +DEFINE_NFSD_IO_EVENT(write_io_done); +DEFINE_NFSD_IO_EVENT(write_done); #include "state.h" diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d41c149fae75..6739077f17fe 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -43,6 +43,7 @@ #include "nfsd.h" #include "vfs.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_FILEOP @@ -997,16 +998,23 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct raparms *ra; __be32 err; + trace_read_start(rqstp, fhp, offset, vlen); err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); if (err) return err; ra = nfsd_init_raparms(file); + + trace_read_opened(rqstp, fhp, offset, vlen); err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count); + trace_read_io_done(rqstp, fhp, offset, vlen); + if (ra) nfsd_put_raparams(file, ra); fput(file); + trace_read_done(rqstp, fhp, offset, vlen); + return err; } @@ -1022,24 +1030,31 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, { __be32 err = 0; + trace_write_start(rqstp, fhp, offset, vlen); + if (file) { err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; + trace_write_opened(rqstp, fhp, offset, vlen); err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep); + trace_write_io_done(rqstp, fhp, offset, vlen); } else { err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); if (err) goto out; + trace_write_opened(rqstp, fhp, offset, vlen); if (cnt) err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep); + trace_write_io_done(rqstp, fhp, offset, vlen); fput(file); } out: + trace_write_done(rqstp, fhp, offset, vlen); return err; } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c7343844e6b6..7f5d3d9f1c37 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1416,7 +1416,8 @@ static int __init nilfs_init_cachep(void) { nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", sizeof(struct nilfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + nilfs_inode_init_once); if (!nilfs_inode_cachep) goto fail; diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index e785fd954c30..741077deef3b 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -199,8 +199,7 @@ void fsnotify_unmount_inodes(struct super_block *sb) break; } spin_unlock(&next_i->i_lock); - next_i = list_entry(next_i->i_sb_list.next, - struct inode, i_sb_list); + next_i = list_next_entry(next_i, i_sb_list); } /* diff --git a/fs/notify/mark.c b/fs/notify/mark.c index fc0df4442f7b..cfcbf114676e 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -92,9 +92,6 @@ #include "fsnotify.h" struct srcu_struct fsnotify_mark_srcu; -static DEFINE_SPINLOCK(destroy_lock); -static LIST_HEAD(destroy_list); -static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq); void fsnotify_get_mark(struct fsnotify_mark *mark) { @@ -168,10 +165,19 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) atomic_dec(&group->num_marks); } +static void +fsnotify_mark_free_rcu(struct rcu_head *rcu) +{ + struct fsnotify_mark *mark; + + mark = container_of(rcu, struct fsnotify_mark, g_rcu); + fsnotify_put_mark(mark); +} + /* - * Free fsnotify mark. The freeing is actually happening from a kthread which - * first waits for srcu period end. Caller must have a reference to the mark - * or be protected by fsnotify_mark_srcu. + * Free fsnotify mark. The freeing is actually happening from a call_srcu + * callback. Caller must have a reference to the mark or be protected by + * fsnotify_mark_srcu. */ void fsnotify_free_mark(struct fsnotify_mark *mark) { @@ -186,10 +192,7 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - wake_up(&destroy_waitq); + call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu); /* * Some groups like to know that marks are being freed. This is a @@ -385,11 +388,7 @@ err: spin_unlock(&mark->lock); - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - wake_up(&destroy_waitq); - + call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu); return ret; } @@ -492,40 +491,3 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, atomic_set(&mark->refcnt, 1); mark->free_mark = free_mark; } - -static int fsnotify_mark_destroy(void *ignored) -{ - struct fsnotify_mark *mark, *next; - struct list_head private_destroy_list; - - for (;;) { - spin_lock(&destroy_lock); - /* exchange the list head */ - list_replace_init(&destroy_list, &private_destroy_list); - spin_unlock(&destroy_lock); - - synchronize_srcu(&fsnotify_mark_srcu); - - list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { - list_del_init(&mark->g_list); - fsnotify_put_mark(mark); - } - - wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list)); - } - - return 0; -} - -static int __init fsnotify_mark_init(void) -{ - struct task_struct *thread; - - thread = kthread_run(fsnotify_mark_destroy, NULL, - "fsnotify_mark"); - if (IS_ERR(thread)) - panic("unable to start fsnotify mark destruction thread."); - - return 0; -} -device_initcall(fsnotify_mark_init); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index d1a853585b53..2f77f8dfb861 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3139,8 +3139,8 @@ static int __init init_ntfs_fs(void) ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, sizeof(big_ntfs_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - ntfs_big_inode_init_once); + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ntfs_big_inode_init_once); if (!ntfs_big_inode_cache) { pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); goto big_inode_err_out; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 86181d6526dc..a3ded88718c9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -164,7 +164,7 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); -static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, .eo_update_clusters = ocfs2_dinode_update_clusters, @@ -286,7 +286,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_value_update_clusters, @@ -332,7 +332,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_tree_update_clusters, @@ -379,7 +379,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) et->et_root_el = &dx_root->dr_list; } -static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, .eo_update_clusters = ocfs2_dx_root_update_clusters, @@ -425,7 +425,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, return CONTIG_NONE; } -static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_refcount_tree_update_clusters, @@ -438,7 +438,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, struct buffer_head *bh, ocfs2_journal_access_func access, void *obj, - struct ocfs2_extent_tree_operations *ops) + const struct ocfs2_extent_tree_operations *ops) { et->et_ops = ops; et->et_root_bh = bh; @@ -6174,8 +6174,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, } bail: - if (tl_inode) - iput(tl_inode); + iput(tl_inode); brelse(tl_bh); if (status < 0) { diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index fb09b97db162..f3dc1b0dfffc 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -54,7 +54,7 @@ */ struct ocfs2_extent_tree_operations; struct ocfs2_extent_tree { - struct ocfs2_extent_tree_operations *et_ops; + const struct ocfs2_extent_tree_operations *et_ops; struct buffer_head *et_root_bh; struct ocfs2_extent_list *et_root_el; struct ocfs2_caching_info *et_ci; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 709fbbd44c65..a3cc6d2fc896 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1780,8 +1780,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, } ++live_threshold; atomic_set(®->hr_steady_iterations, live_threshold); - /* unsteady_iterations is double the steady_iterations */ - atomic_set(®->hr_unsteady_iterations, (live_threshold << 1)); + /* unsteady_iterations is triple the steady_iterations */ + atomic_set(®->hr_unsteady_iterations, (live_threshold * 3)); hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", reg->hr_item.ci_name); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e88ccf8c83ff..68c607e63ff6 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -376,17 +376,6 @@ struct dlm_lock lksb_kernel_allocated:1; }; - -#define DLM_LKSB_UNUSED1 0x01 -#define DLM_LKSB_PUT_LVB 0x02 -#define DLM_LKSB_GET_LVB 0x04 -#define DLM_LKSB_UNUSED2 0x08 -#define DLM_LKSB_UNUSED3 0x10 -#define DLM_LKSB_UNUSED4 0x20 -#define DLM_LKSB_UNUSED5 0x40 -#define DLM_LKSB_UNUSED6 0x80 - - enum dlm_lockres_list { DLM_GRANTED_LIST = 0, DLM_CONVERTING_LIST = 1, diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 84f2f8079466..9477d6e1de37 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2388,8 +2388,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) spin_lock(&res->spinlock); BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); if (test_bit(node, res->refmap)) { - __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); dlm_lockres_clear_refmap_bit(dlm, res, node); cleared = 1; } @@ -2519,6 +2519,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); + /* get an extra reference on the mle. + * otherwise the assert_master from the new + * master will destroy this. + */ + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -2544,7 +2549,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } fail: - if (oldmle) { + if (ret != -EEXIST && oldmle) { /* master is known, detach if not already detached */ dlm_mle_detach_hb_events(dlm, oldmle); dlm_put_mle(oldmle); @@ -2554,6 +2559,7 @@ fail: if (mle_added) { dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); + dlm_put_mle_inuse(mle); } else if (mle) { kmem_cache_free(dlm_mle_cache, mle); mle = NULL; @@ -2571,17 +2577,6 @@ fail: * ensure that all assert_master work is flushed. */ flush_workqueue(dlm->dlm_worker); - /* get an extra reference on the mle. - * otherwise the assert_master from the new - * master will destroy this. - * also, make sure that all callers of dlm_get_mle - * take both dlm->spinlock and dlm->master_lock */ - spin_lock(&dlm->spinlock); - spin_lock(&dlm->master_lock); - dlm_get_mle_inuse(mle); - spin_unlock(&dlm->master_lock); - spin_unlock(&dlm->spinlock); - /* notify new node and send all lock state */ /* call send_one_lockres with migration flag. * this serves as notice to the target node that a @@ -3050,7 +3045,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, int ret = 0; if (!dlm_grab(dlm)) - return -EINVAL; + return 0; name = migrate->name; namelen = migrate->namelen; @@ -3141,7 +3136,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, mlog(0, "tried to migrate %.*s, but some " "process beat me to it\n", namelen, name); - ret = -EEXIST; + spin_unlock(&tmp->spinlock); + return -EEXIST; } else { /* bad. 2 NODES are trying to migrate! */ mlog(ML_ERROR, "migration error mle: " @@ -3312,6 +3308,15 @@ top: mle->new_master != dead_node) continue; + if (mle->new_master == dead_node && mle->inuse) { + mlog(ML_NOTICE, "%s: target %u died during " + "migration from %u, the MLE is " + "still keep used, ignore it!\n", + dlm->name, dead_node, + mle->master); + continue; + } + /* If we have reached this point, this mle needs to be * removed from the list and freed. */ dlm_clean_migration_mle(dlm, mle); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 9e4f862d20fe..c5bdf02c213b 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1373,6 +1373,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, char *buf = NULL; struct dlm_work_item *item = NULL; struct dlm_lock_resource *res = NULL; + unsigned int hash; if (!dlm_grab(dlm)) return -EINVAL; @@ -1400,7 +1401,10 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, /* lookup the lock to see if we have a secondary queue for this * already... just add the locks in and this will have its owner * and RECOVERY flag changed when it completes. */ - res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); + hash = dlm_lockid_hash(mres->lockname, mres->lockname_len); + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len, + hash); if (res) { /* this will get a ref on res */ /* mark it as recovering/migrating and hash it */ @@ -1421,13 +1425,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, mres->lockname_len, mres->lockname); ret = -EFAULT; spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); dlm_lockres_put(res); goto leave; } res->state |= DLM_LOCK_RES_MIGRATING; } spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); } else { + spin_unlock(&dlm->spinlock); /* need to allocate, just like if it was * mastered here normally */ res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); @@ -2450,11 +2457,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) * perhaps later we can genericize this for other waiters. */ wake_up(&dlm->migration_wq); - if (test_bit(idx, dlm->recovery_map)) - mlog(0, "domain %s, node %u already added " - "to recovery map!\n", dlm->name, idx); - else - set_bit(idx, dlm->recovery_map); + set_bit(idx, dlm->recovery_map); } void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 2e3c9dbab68c..1082b2c3014b 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -421,7 +421,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } if (!dlm_grab(dlm)) - return DLM_REJECTED; + return DLM_FORWARD; mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), "Domain %s not fully joined!\n", dlm->name); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b5cf27dcb18a..03768bb3aab1 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void) dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", sizeof(struct dlmfs_inode_private), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), dlmfs_init_once); if (!dlmfs_inode_cache) { status = -ENOMEM; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 20276e340339..f92612e4b9d6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2432,12 +2432,6 @@ bail: * done this we have to return AOP_TRUNCATED_PAGE so the aop method * that called us can bubble that back up into the VFS who will then * immediately retry the aop call. - * - * We do a blocking lock and immediate unlock before returning, though, so that - * the lock has a great chance of being cached on this node by the time the VFS - * calls back to retry the aop. This has a potential to livelock as nodes - * ping locks back and forth, but that's a risk we're willing to take to avoid - * the lock inversion simply. */ int ocfs2_inode_lock_with_page(struct inode *inode, struct buffer_head **ret_bh, @@ -2449,8 +2443,6 @@ int ocfs2_inode_lock_with_page(struct inode *inode, ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { unlock_page(page); - if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) - ocfs2_inode_unlock(inode, ex); ret = AOP_TRUNCATED_PAGE; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0e5b4515f92e..d63127932509 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1302,6 +1302,14 @@ int ocfs2_getattr(struct vfsmount *mnt, } generic_fillattr(inode, stat); + /* + * If there is inline data in the inode, the inode will normally not + * have data blocks allocated (it may have an external xattr block). + * Report at least one sector for such files, so tools like tar, rsync, + * others don't incorrectly think the file is completely sparse. + */ + if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + stat->blocks += (stat->size + 511)>>9; /* We set the blksize from the cluster size for performance */ stat->blksize = osb->s_clustersize; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 3cb097ccce60..16b0bb482ea7 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -606,9 +606,7 @@ bail: if (gb_inode) mutex_unlock(&gb_inode->i_mutex); - if (gb_inode) - iput(gb_inode); - + iput(gb_inode); brelse(bh); return status; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 13534f4fe5b5..3772a2dbb980 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1042,8 +1042,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) // up_write(&journal->j_trans_barrier); done: - if (inode) - iput(inode); + iput(inode); } static void ocfs2_clear_journal_error(struct super_block *sb, @@ -1687,9 +1686,7 @@ done: if (got_lock) ocfs2_inode_unlock(inode, 1); - if (inode) - iput(inode); - + iput(inode); brelse(bh); return status; @@ -1796,8 +1793,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb, ocfs2_inode_unlock(inode, 1); bail: - if (inode) - iput(inode); + iput(inode); return status; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 0a4457fb0711..e9c99e35f5ea 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -358,8 +358,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) bail: if (status < 0) brelse(alloc_bh); - if (inode) - iput(inode); + iput(inode); trace_ocfs2_load_local_alloc(osb->local_alloc_bits); @@ -473,8 +472,7 @@ out_mutex: iput(main_bm_inode); out: - if (local_alloc_inode) - iput(local_alloc_inode); + iput(local_alloc_inode); kfree(alloc_copy); } @@ -1327,9 +1325,7 @@ bail: brelse(main_bm_bh); - if (main_bm_inode) - iput(main_bm_inode); - + iput(main_bm_inode); kfree(alloc_copy); if (ac) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index afb81eae2c18..ab42c38031b1 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1683,8 +1683,7 @@ bail: if (new_inode) sync_mapping_buffers(old_inode->i_mapping); - if (new_inode) - iput(new_inode); + iput(new_inode); ocfs2_free_dir_lookup_result(&target_lookup_res); ocfs2_free_dir_lookup_result(&old_entry_lookup); @@ -2373,6 +2372,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, name, strlen(name)); + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + /* find it's spot in the orphan directory */ status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode, &lookup); @@ -2388,15 +2396,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, goto leave; } - status = ocfs2_journal_access_di(handle, - INODE_CACHE(orphan_dir_inode), - orphan_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - /* do the i_nlink dance! :) */ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index b6d51333ad02..d153e6e31529 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -82,7 +82,7 @@ struct ocfs2_quota_chunk { extern struct kmem_cache *ocfs2_dquot_cachep; extern struct kmem_cache *ocfs2_qf_chunk_cachep; -extern struct qtree_fmt_operations ocfs2_global_ops; +extern const struct qtree_fmt_operations ocfs2_global_ops; struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( struct ocfs2_super *osb, int slot_num); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index c93d67220887..fde9ef18cff3 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -123,7 +123,7 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot) dquot->dq_id); } -struct qtree_fmt_operations ocfs2_global_ops = { +const struct qtree_fmt_operations ocfs2_global_ops = { .mem2disk_dqblk = ocfs2_global_mem2diskdqb, .disk2mem_dqblk = ocfs2_global_disk2memdqb, .is_id = ocfs2_global_is_id, diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index e78a203d44c8..1e09592148ad 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -322,8 +322,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) if (si == NULL) return; - if (si->si_inode) - iput(si->si_inode); + iput(si->si_inode); if (si->si_bh) { for (i = 0; i < si->si_blocks; i++) { if (si->si_bh[i]) { @@ -503,8 +502,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb) trace_ocfs2_find_slot(osb->slot_num); status = ocfs2_update_disk_slot(osb, si, osb->slot_num); - if (status < 0) + if (status < 0) { mlog_errno(status); + /* + * if write block failed, invalidate slot to avoid overwrite + * slot during dismount in case another node rightly has mounted + */ + spin_lock(&osb->osb_lock); + ocfs2_invalidate_slot(si, osb->slot_num); + osb->slot_num = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + } bail: return status; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2de4c8a9340c..faa1365097bc 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1280,6 +1280,8 @@ static int ocfs2_parse_options(struct super_block *sb, int status, user_stack = 0; char *p; u32 tmp; + int token, option; + substring_t args[MAX_OPT_ARGS]; trace_ocfs2_parse_options(is_remount, options ? options : "(none)"); @@ -1298,9 +1300,6 @@ static int ocfs2_parse_options(struct super_block *sb, } while ((p = strsep(&options, ",")) != NULL) { - int token, option; - substring_t args[MAX_OPT_ARGS]; - if (!*p) continue; @@ -1367,7 +1366,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->atime_quantum = option; break; case Opt_slot: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1376,7 +1374,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->slot = (s16)option; break; case Opt_commit: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1388,7 +1385,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->commit_interval = HZ * option; break; case Opt_localalloc: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1726,8 +1722,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) ocfs2_inode_unlock(inode, 0); status = 0; bail: - if (inode) - iput(inode); + iput(inode); if (status) mlog_errno(status); @@ -1771,7 +1766,7 @@ static int ocfs2_initialize_mem_caches(void) sizeof(struct ocfs2_inode_info), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), ocfs2_inode_init_once); ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", sizeof(struct ocfs2_dquot), diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 15e4500cda3e..b61b883c8ff8 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -443,7 +443,7 @@ static int __init init_openprom_fs(void) sizeof(struct op_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | SLAB_ACCOUNT), op_inode_init_once); if (!op_inode_cachep) return -ENOMEM; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d0e9b9b6223e..42305ddcbaa0 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -95,7 +95,8 @@ void __init proc_init_inodecache(void) proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_MEM_SPREAD|SLAB_ACCOUNT| + SLAB_PANIC), init_once); } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 9155a5a0d3b9..df4661abadc4 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -57,11 +57,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) /* * Estimate the amount of memory available for userspace allocations, * without causing swapping. - * - * Free memory cannot be taken below the low watermark, before the - * system starts swapping. */ - available = i.freeram - wmark_low; + available = i.freeram - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will diff --git a/fs/proc/page.c b/fs/proc/page.c index 93484034a03d..b2855eea5405 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page) * pseudo flags for the well known (anonymous) memory mapped pages * * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the - * simple test in page_mapped() is not enough. + * simple test in page_mapcount() is not enough. */ - if (!PageSlab(page) && page_mapped(page)) + if (!PageSlab(page) && page_mapcount(page)) u |= 1 << KPF_MMAP; if (PageAnon(page)) u |= 1 << KPF_ANON; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 187b3b5f242e..65a1b6c69c11 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -14,6 +14,7 @@ #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> +#include <linux/shmem_fs.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -22,9 +23,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap, ptes, pmds; + unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + anon = get_mm_counter(mm, MM_ANONPAGES); + file = get_mm_counter(mm, MM_FILEPAGES); + shmem = get_mm_counter(mm, MM_SHMEMPAGES); + /* * Note: to minimize their overhead, mm maintains hiwater_vm and * hiwater_rss only when about to *lower* total_vm or rss. Any @@ -35,11 +40,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) hiwater_vm = total_vm = mm->total_vm; if (hiwater_vm < mm->hiwater_vm) hiwater_vm = mm->hiwater_vm; - hiwater_rss = total_rss = get_mm_rss(mm); + hiwater_rss = total_rss = anon + file + shmem; if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; - data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); @@ -52,6 +56,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPin:\t%8lu kB\n" "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" + "RssAnon:\t%8lu kB\n" + "RssFile:\t%8lu kB\n" + "RssShmem:\t%8lu kB\n" "VmData:\t%8lu kB\n" "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" @@ -65,7 +72,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), total_rss << (PAGE_SHIFT-10), - data << (PAGE_SHIFT-10), + anon << (PAGE_SHIFT-10), + file << (PAGE_SHIFT-10), + shmem << (PAGE_SHIFT-10), + mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, ptes >> 10, pmds >> 10, @@ -82,10 +92,11 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { - *shared = get_mm_counter(mm, MM_FILEPAGES); + *shared = get_mm_counter(mm, MM_FILEPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; - *data = mm->total_vm - mm->shared_vm; + *data = mm->data_vm + mm->stack_vm; *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); return mm->total_vm; } @@ -451,12 +462,14 @@ struct mem_size_stats { unsigned long private_hugetlb; u64 pss; u64 swap_pss; + bool check_shmem_swap; }; static void smaps_account(struct mem_size_stats *mss, struct page *page, - unsigned long size, bool young, bool dirty) + bool compound, bool young, bool dirty) { - int mapcount; + int i, nr = compound ? HPAGE_PMD_NR : 1; + unsigned long size = nr * PAGE_SIZE; if (PageAnon(page)) mss->anonymous += size; @@ -465,26 +478,53 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, /* Accumulate the size in pages that have been accessed. */ if (young || page_is_young(page) || PageReferenced(page)) mss->referenced += size; - mapcount = page_mapcount(page); - if (mapcount >= 2) { - u64 pss_delta; - if (dirty || PageDirty(page)) - mss->shared_dirty += size; - else - mss->shared_clean += size; - pss_delta = (u64)size << PSS_SHIFT; - do_div(pss_delta, mapcount); - mss->pss += pss_delta; - } else { + /* + * page_count(page) == 1 guarantees the page is mapped exactly once. + * If any subpage of the compound page mapped with PTE it would elevate + * page_count(). + */ + if (page_count(page) == 1) { if (dirty || PageDirty(page)) mss->private_dirty += size; else mss->private_clean += size; mss->pss += (u64)size << PSS_SHIFT; + return; + } + + for (i = 0; i < nr; i++, page++) { + int mapcount = page_mapcount(page); + + if (mapcount >= 2) { + if (dirty || PageDirty(page)) + mss->shared_dirty += PAGE_SIZE; + else + mss->shared_clean += PAGE_SIZE; + mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; + } else { + if (dirty || PageDirty(page)) + mss->private_dirty += PAGE_SIZE; + else + mss->private_clean += PAGE_SIZE; + mss->pss += PAGE_SIZE << PSS_SHIFT; + } } } +#ifdef CONFIG_SHMEM +static int smaps_pte_hole(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + + mss->swap += shmem_partial_swap_usage( + walk->vma->vm_file->f_mapping, addr, end); + + return 0; +} +#endif + static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) { @@ -512,11 +552,25 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap + && pte_none(*pte))) { + page = find_get_entry(vma->vm_file->f_mapping, + linear_page_index(vma, addr)); + if (!page) + return; + + if (radix_tree_exceptional_entry(page)) + mss->swap += PAGE_SIZE; + else + page_cache_release(page); + + return; } if (!page) return; - smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); + + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte)); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -532,8 +586,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, if (IS_ERR_OR_NULL(page)) return; mss->anonymous_thp += HPAGE_PMD_SIZE; - smaps_account(mss, page, HPAGE_PMD_SIZE, - pmd_young(*pmd), pmd_dirty(*pmd)); + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -549,7 +602,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); return 0; @@ -671,6 +724,31 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) }; memset(&mss, 0, sizeof mss); + +#ifdef CONFIG_SHMEM + if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { + /* + * For shared or readonly shmem mappings we know that all + * swapped out pages belong to the shmem object, and we can + * obtain the swap value much more efficiently. For private + * writable mappings, we might have COW pages that are + * not affected by the parent swapped out pages of the shmem + * object, so we have to distinguish them during the page walk. + * Unless we know that the shmem object (or the part mapped by + * our VMA) has no swapped out pages at all. + */ + unsigned long shmem_swapped = shmem_swap_usage(vma); + + if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || + !(vma->vm_flags & VM_WRITE)) { + mss.swap = shmem_swapped; + } else { + mss.check_shmem_swap = true; + smaps_walk.pte_hole = smaps_pte_hole; + } + } +#endif + /* mmap_sem is held in m_start */ walk_page_vma(vma, &smaps_walk); @@ -817,9 +895,6 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); - if (vma->vm_flags & VM_SOFTDIRTY) - vma->vm_flags &= ~VM_SOFTDIRTY; - set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } #else @@ -838,7 +913,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty_pmd(vma, addr, pmd); goto out; @@ -1112,7 +1187,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmdp, vma, &ptl)) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; @@ -1444,7 +1519,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index f37b3deb01b4..3a67cfb142d8 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -365,7 +365,7 @@ static int init_inodecache(void) qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache", sizeof(struct qnx4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (qnx4_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 9728b5499e1d..47bb1de07155 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -625,7 +625,7 @@ static int init_inodecache(void) qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", sizeof(struct qnx6_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (!qnx6_inode_cachep) return -ENOMEM; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ef0d64b2a6d9..fbd70af98820 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2924,4 +2924,4 @@ static int __init dquot_init(void) return 0; } -module_init(dquot_init); +fs_initcall(dquot_init); diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index bb2869f5dfd8..d07a2f91d858 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -1,7 +1,5 @@ - #include <linux/cred.h> #include <linux/init.h> -#include <linux/module.h> #include <linux/kernel.h> #include <linux/quotaops.h> #include <linux/sched.h> @@ -105,5 +103,4 @@ static int __init quota_init(void) "VFS: Failed to create quota netlink interface.\n"); return 0; }; - -module_init(quota_init); +fs_initcall(quota_init); diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 2aa012a68e90..ed85d4f35c04 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -30,13 +30,13 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot); static void v2r1_disk2memdqb(struct dquot *dquot, void *dp); static int v2r1_is_id(void *dp, struct dquot *dquot); -static struct qtree_fmt_operations v2r0_qtree_ops = { +static const struct qtree_fmt_operations v2r0_qtree_ops = { .mem2disk_dqblk = v2r0_mem2diskdqb, .disk2mem_dqblk = v2r0_disk2memdqb, .is_id = v2r0_is_id, }; -static struct qtree_fmt_operations v2r1_qtree_ops = { +static const struct qtree_fmt_operations v2r1_qtree_ops = { .mem2disk_dqblk = v2r1_mem2diskdqb, .disk2mem_dqblk = v2r1_disk2memdqb, .is_id = v2r1_is_id, diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 4a62fe8cc3bf..05db7473bcb5 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -626,7 +626,8 @@ static int __init init_inodecache(void) sizeof(struct reiserfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD| + SLAB_ACCOUNT), init_once); if (reiserfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index bb894e78a821..6b00ca357c58 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -619,8 +619,8 @@ static int __init init_romfs_fs(void) romfs_inode_cachep = kmem_cache_create("romfs_i", sizeof(struct romfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - romfs_i_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_ACCOUNT, romfs_i_init_once); if (!romfs_inode_cachep) { pr_err("Failed to initialise inode cache\n"); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index dded920cbc8f..5e79bfa4f260 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -419,7 +419,8 @@ static int __init init_inodecache(void) { squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", sizeof(struct squashfs_inode_info), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once); + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + init_once); return squashfs_inode_cachep ? 0 : -ENOMEM; } diff --git a/fs/stat.c b/fs/stat.c index d4a61d8dc021..bc045c7994e1 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -219,7 +219,7 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat # define choose_32_64(a,b) b #endif -#define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x) +#define valid_dev(x) choose_32_64(old_valid_dev(x),true) #define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) #ifndef INIT_STRUCT_STAT_PADDING diff --git a/fs/super.c b/fs/super.c index cc658a20a29e..1182af8fd5ff 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1197,7 +1197,7 @@ int __sb_start_write(struct super_block *sb, int level, bool wait) else ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); - WARN_ON(force_trylock & !ret); + WARN_ON(force_trylock && !ret); return ret; } EXPORT_SYMBOL(__sb_start_write); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 07ac18c355e7..d62c423a5a2d 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -346,7 +346,7 @@ int __init sysv_init_icache(void) { sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", sizeof(struct sysv_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, init_once); if (!sysv_inode_cachep) return -ENOMEM; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1fd90c079537..a233ba913be4 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2248,8 +2248,8 @@ static int __init ubifs_init(void) ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", sizeof(struct ubifs_inode), 0, - SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, - &inode_slab_ctor); + SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT, &inode_slab_ctor); if (!ubifs_inode_slab) return -ENOMEM; diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 6d6a96b4e73f..e0fd65fe73e8 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -447,9 +447,6 @@ static void udf_table_free_blocks(struct super_block *sb, */ int adsize; - struct short_ad *sad = NULL; - struct long_ad *lad = NULL; - struct allocExtDesc *aed; eloc.logicalBlockNum = start; elen = EXT_RECORDED_ALLOCATED | @@ -466,102 +463,17 @@ static void udf_table_free_blocks(struct super_block *sb, } if (epos.offset + (2 * adsize) > sb->s_blocksize) { - unsigned char *sptr, *dptr; - int loffset; - - brelse(oepos.bh); - oepos = epos; - /* Steal a block from the extent being free'd */ - epos.block.logicalBlockNum = eloc.logicalBlockNum; + udf_setup_indirect_aext(table, eloc.logicalBlockNum, + &epos); + eloc.logicalBlockNum++; elen -= sb->s_blocksize; - - epos.bh = udf_tread(sb, - udf_get_lb_pblock(sb, &epos.block, 0)); - if (!epos.bh) { - brelse(oepos.bh); - goto error_return; - } - aed = (struct allocExtDesc *)(epos.bh->b_data); - aed->previousAllocExtLocation = - cpu_to_le32(oepos.block.logicalBlockNum); - if (epos.offset + adsize > sb->s_blocksize) { - loffset = epos.offset; - aed->lengthAllocDescs = cpu_to_le32(adsize); - sptr = iinfo->i_ext.i_data + epos.offset - - adsize; - dptr = epos.bh->b_data + - sizeof(struct allocExtDesc); - memcpy(dptr, sptr, adsize); - epos.offset = sizeof(struct allocExtDesc) + - adsize; - } else { - loffset = epos.offset + adsize; - aed->lengthAllocDescs = cpu_to_le32(0); - if (oepos.bh) { - sptr = oepos.bh->b_data + epos.offset; - aed = (struct allocExtDesc *) - oepos.bh->b_data; - le32_add_cpu(&aed->lengthAllocDescs, - adsize); - } else { - sptr = iinfo->i_ext.i_data + - epos.offset; - iinfo->i_lenAlloc += adsize; - mark_inode_dirty(table); - } - epos.offset = sizeof(struct allocExtDesc); - } - if (sbi->s_udfrev >= 0x0200) - udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, - 3, 1, epos.block.logicalBlockNum, - sizeof(struct tag)); - else - udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, - 2, 1, epos.block.logicalBlockNum, - sizeof(struct tag)); - - switch (iinfo->i_alloc_type) { - case ICBTAG_FLAG_AD_SHORT: - sad = (struct short_ad *)sptr; - sad->extLength = cpu_to_le32( - EXT_NEXT_EXTENT_ALLOCDECS | - sb->s_blocksize); - sad->extPosition = - cpu_to_le32(epos.block.logicalBlockNum); - break; - case ICBTAG_FLAG_AD_LONG: - lad = (struct long_ad *)sptr; - lad->extLength = cpu_to_le32( - EXT_NEXT_EXTENT_ALLOCDECS | - sb->s_blocksize); - lad->extLocation = - cpu_to_lelb(epos.block); - break; - } - if (oepos.bh) { - udf_update_tag(oepos.bh->b_data, loffset); - mark_buffer_dirty(oepos.bh); - } else { - mark_inode_dirty(table); - } } /* It's possible that stealing the block emptied the extent */ - if (elen) { - udf_write_aext(table, &epos, &eloc, elen, 1); - - if (!epos.bh) { - iinfo->i_lenAlloc += adsize; - mark_inode_dirty(table); - } else { - aed = (struct allocExtDesc *)epos.bh->b_data; - le32_add_cpu(&aed->lengthAllocDescs, adsize); - udf_update_tag(epos.bh->b_data, epos.offset); - mark_buffer_dirty(epos.bh); - } - } + if (elen) + __udf_add_aext(table, &epos, &eloc, elen, 1); } brelse(epos.bh); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 055746350d16..87dc16d15572 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -539,9 +539,18 @@ static int udf_do_extend_file(struct inode *inode, udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); count++; - } else + } else { + struct kernel_lb_addr tmploc; + uint32_t tmplen; + udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); + /* + * We've rewritten the last extent but there may be empty + * indirect extent after it - enter it. + */ + udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); + } /* Managed to do everything necessary? */ if (!blocks) @@ -1867,22 +1876,90 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, return inode; } -int udf_add_aext(struct inode *inode, struct extent_position *epos, - struct kernel_lb_addr *eloc, uint32_t elen, int inc) +int udf_setup_indirect_aext(struct inode *inode, int block, + struct extent_position *epos) { - int adsize; - struct short_ad *sad = NULL; - struct long_ad *lad = NULL; + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; struct allocExtDesc *aed; - uint8_t *ptr; - struct udf_inode_info *iinfo = UDF_I(inode); + struct extent_position nepos; + struct kernel_lb_addr neloc; + int ver, adsize; - if (!epos->bh) - ptr = iinfo->i_ext.i_data + epos->offset - - udf_file_entry_alloc_offset(inode) + - iinfo->i_lenEAttr; + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); else - ptr = epos->bh->b_data + epos->offset; + return -EIO; + + neloc.logicalBlockNum = block; + neloc.partitionReferenceNum = epos->block.partitionReferenceNum; + + bh = udf_tgetblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); + if (!bh) + return -EIO; + lock_buffer(bh); + memset(bh->b_data, 0x00, sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + + aed = (struct allocExtDesc *)(bh->b_data); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) { + aed->previousAllocExtLocation = + cpu_to_le32(epos->block.logicalBlockNum); + } + aed->lengthAllocDescs = cpu_to_le32(0); + if (UDF_SB(sb)->s_udfrev >= 0x0200) + ver = 3; + else + ver = 2; + udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block, + sizeof(struct tag)); + + nepos.block = neloc; + nepos.offset = sizeof(struct allocExtDesc); + nepos.bh = bh; + + /* + * Do we have to copy current last extent to make space for indirect + * one? + */ + if (epos->offset + adsize > sb->s_blocksize) { + struct kernel_lb_addr cp_loc; + uint32_t cp_len; + int cp_type; + + epos->offset -= adsize; + cp_type = udf_current_aext(inode, epos, &cp_loc, &cp_len, 0); + cp_len |= ((uint32_t)cp_type) << 30; + + __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1); + udf_write_aext(inode, epos, &nepos.block, + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + } else { + __udf_add_aext(inode, epos, &nepos.block, + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + } + + brelse(epos->bh); + *epos = nepos; + + return 0; +} + +/* + * Append extent at the given position - should be the first free one in inode + * / indirect extent. This function assumes there is enough space in the inode + * or indirect extent. Use udf_add_aext() if you didn't check for this before. + */ +int __udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + struct allocExtDesc *aed; + int adsize; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); @@ -1891,88 +1968,14 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos, else return -EIO; - if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { - unsigned char *sptr, *dptr; - struct buffer_head *nbh; - int err, loffset; - struct kernel_lb_addr obloc = epos->block; - - epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL, - obloc.partitionReferenceNum, - obloc.logicalBlockNum, &err); - if (!epos->block.logicalBlockNum) - return -ENOSPC; - nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, - &epos->block, - 0)); - if (!nbh) - return -EIO; - lock_buffer(nbh); - memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); - set_buffer_uptodate(nbh); - unlock_buffer(nbh); - mark_buffer_dirty_inode(nbh, inode); - - aed = (struct allocExtDesc *)(nbh->b_data); - if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) - aed->previousAllocExtLocation = - cpu_to_le32(obloc.logicalBlockNum); - if (epos->offset + adsize > inode->i_sb->s_blocksize) { - loffset = epos->offset; - aed->lengthAllocDescs = cpu_to_le32(adsize); - sptr = ptr - adsize; - dptr = nbh->b_data + sizeof(struct allocExtDesc); - memcpy(dptr, sptr, adsize); - epos->offset = sizeof(struct allocExtDesc) + adsize; - } else { - loffset = epos->offset + adsize; - aed->lengthAllocDescs = cpu_to_le32(0); - sptr = ptr; - epos->offset = sizeof(struct allocExtDesc); - - if (epos->bh) { - aed = (struct allocExtDesc *)epos->bh->b_data; - le32_add_cpu(&aed->lengthAllocDescs, adsize); - } else { - iinfo->i_lenAlloc += adsize; - mark_inode_dirty(inode); - } - } - if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200) - udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1, - epos->block.logicalBlockNum, sizeof(struct tag)); - else - udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1, - epos->block.logicalBlockNum, sizeof(struct tag)); - switch (iinfo->i_alloc_type) { - case ICBTAG_FLAG_AD_SHORT: - sad = (struct short_ad *)sptr; - sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | - inode->i_sb->s_blocksize); - sad->extPosition = - cpu_to_le32(epos->block.logicalBlockNum); - break; - case ICBTAG_FLAG_AD_LONG: - lad = (struct long_ad *)sptr; - lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | - inode->i_sb->s_blocksize); - lad->extLocation = cpu_to_lelb(epos->block); - memset(lad->impUse, 0x00, sizeof(lad->impUse)); - break; - } - if (epos->bh) { - if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || - UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) - udf_update_tag(epos->bh->b_data, loffset); - else - udf_update_tag(epos->bh->b_data, - sizeof(struct allocExtDesc)); - mark_buffer_dirty_inode(epos->bh, inode); - brelse(epos->bh); - } else { - mark_inode_dirty(inode); - } - epos->bh = nbh; + if (!epos->bh) { + WARN_ON(iinfo->i_lenAlloc != + epos->offset - udf_file_entry_alloc_offset(inode)); + } else { + aed = (struct allocExtDesc *)epos->bh->b_data; + WARN_ON(le32_to_cpu(aed->lengthAllocDescs) != + epos->offset - sizeof(struct allocExtDesc)); + WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize); } udf_write_aext(inode, epos, eloc, elen, inc); @@ -1996,6 +1999,41 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos, return 0; } +/* + * Append extent at given position - should be the first free one in inode + * / indirect extent. Takes care of allocating and linking indirect blocks. + */ +int udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + int adsize; + struct super_block *sb = inode->i_sb; + + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + return -EIO; + + if (epos->offset + (2 * adsize) > sb->s_blocksize) { + int err; + int new_block; + + new_block = udf_new_block(sb, NULL, + epos->block.partitionReferenceNum, + epos->block.logicalBlockNum, &err); + if (!new_block) + return -ENOSPC; + + err = udf_setup_indirect_aext(inode, new_block, epos); + if (err) + return err; + } + + return __udf_add_aext(inode, epos, eloc, elen, inc); +} + void udf_write_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { @@ -2048,14 +2086,29 @@ void udf_write_aext(struct inode *inode, struct extent_position *epos, epos->offset += adsize; } +/* + * Only 1 indirect extent in a row really makes sense but allow upto 16 in case + * someone does some weird stuff. + */ +#define UDF_MAX_INDIR_EXTS 16 + int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t *elen, int inc) { int8_t etype; + unsigned int indirections = 0; while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { int block; + + if (++indirections > UDF_MAX_INDIR_EXTS) { + udf_err(inode->i_sb, + "too many indirect extents in inode %lu\n", + inode->i_ino); + return -1; + } + epos->block = *eloc; epos->offset = sizeof(struct allocExtDesc); brelse(epos->bh); diff --git a/fs/udf/super.c b/fs/udf/super.c index 81155b9b445b..0fbb4c7c72e8 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -179,7 +179,8 @@ static int __init init_inodecache(void) udf_inode_cachep = kmem_cache_create("udf_inode_cache", sizeof(struct udf_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | + SLAB_ACCOUNT), init_once); if (!udf_inode_cachep) return -ENOMEM; @@ -1586,6 +1587,13 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ } /* + * Maximum number of Terminating Descriptor redirections. The chosen number is + * arbitrary - just that we hopefully don't limit any real use of rewritten + * inode on write-once media but avoid looping for too long on corrupted media. + */ +#define UDF_MAX_TD_NESTING 64 + +/* * Process a main/reserve volume descriptor sequence. * @block First block of first extent of the sequence. * @lastblock Lastblock of first extent of the sequence. @@ -1609,6 +1617,7 @@ static noinline int udf_process_sequence( uint16_t ident; long next_s = 0, next_e = 0; int ret; + unsigned int indirections = 0; memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); @@ -1679,6 +1688,12 @@ static noinline int udf_process_sequence( } break; case TAG_IDENT_TD: /* ISO 13346 3/10.9 */ + if (++indirections > UDF_MAX_TD_NESTING) { + udf_err(sb, "too many TDs (max %u supported)\n", UDF_MAX_TD_NESTING); + brelse(bh); + return -EIO; + } + vds[VDS_POS_TERMINATING_DESC].block = block; if (next_e) { block = next_s; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index ce169b49429d..fa0044b6b81d 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -158,6 +158,10 @@ extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern long udf_block_map(struct inode *, sector_t); extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, struct kernel_lb_addr *, uint32_t *, sector_t *); +extern int udf_setup_indirect_aext(struct inode *inode, int block, + struct extent_position *epos); +extern int __udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc); extern int udf_add_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); extern void udf_write_aext(struct inode *, struct extent_position *, diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index ab478e62baae..e788a05aab83 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -128,11 +128,15 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) if (c < 0x80U) utf_o->u_name[utf_o->u_len++] = (uint8_t)c; else if (c < 0x800U) { + if (utf_o->u_len > (UDF_NAME_LEN - 4)) + break; utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xc0 | (c >> 6)); utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f)); } else { + if (utf_o->u_len > (UDF_NAME_LEN - 5)) + break; utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xe0 | (c >> 12)); utf_o->u_name[utf_o->u_len++] = @@ -173,17 +177,22 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length) { unsigned c, i, max_val, utf_char; - int utf_cnt, u_len; + int utf_cnt, u_len, u_ch; memset(ocu, 0, sizeof(dstring) * length); ocu[0] = 8; max_val = 0xffU; + u_ch = 1; try_again: u_len = 0U; utf_char = 0U; utf_cnt = 0U; for (i = 0U; i < utf->u_len; i++) { + /* Name didn't fit? */ + if (u_len + 1 + u_ch >= length) + return 0; + c = (uint8_t)utf->u_name[i]; /* Complete a multi-byte UTF-8 character */ @@ -225,6 +234,7 @@ try_again: if (max_val == 0xffU) { max_val = 0xffffU; ocu[0] = (uint8_t)0x10U; + u_ch = 2; goto try_again; } goto error_out; @@ -277,7 +287,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, c = (c << 8) | ocu[i++]; len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len], - UDF_NAME_LEN - utf_o->u_len); + UDF_NAME_LEN - 2 - utf_o->u_len); /* Valid character? */ if (len >= 0) utf_o->u_len += len; @@ -295,15 +305,19 @@ static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int len; unsigned i, max_val; uint16_t uni_char; - int u_len; + int u_len, u_ch; memset(ocu, 0, sizeof(dstring) * length); ocu[0] = 8; max_val = 0xffU; + u_ch = 1; try_again: u_len = 0U; for (i = 0U; i < uni->u_len; i++) { + /* Name didn't fit? */ + if (u_len + 1 + u_ch >= length) + return 0; len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); if (!len) continue; @@ -316,6 +330,7 @@ try_again: if (uni_char > max_val) { max_val = 0xffffU; ocu[0] = (uint8_t)0x10U; + u_ch = 2; goto try_again; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index f6390eec02ca..442fd52ebffe 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1427,7 +1427,7 @@ static int __init init_inodecache(void) ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", sizeof(struct ufs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ufs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index cc6b768fc068..d1c66e465ca5 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) #define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN #define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT #define KM_ZONE_SPREAD SLAB_MEM_SPREAD +#define KM_ZONE_ACCOUNT SLAB_ACCOUNT #define kmem_zone kmem_cache #define kmem_zone_t struct kmem_cache diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3479294c1d58..a708e38b494c 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -535,6 +535,7 @@ xfs_agfl_write_verify( } const struct xfs_buf_ops xfs_agfl_buf_ops = { + .name = "xfs_agfl", .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, }; @@ -1926,7 +1927,7 @@ xfs_alloc_space_available( * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ -STATIC int /* error */ +int /* error */ xfs_alloc_fix_freelist( struct xfs_alloc_arg *args, /* allocation argument structure */ int flags) /* XFS_ALLOC_FLAG_... */ @@ -2339,6 +2340,7 @@ xfs_agf_write_verify( } const struct xfs_buf_ops xfs_agf_buf_ops = { + .name = "xfs_agf", .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 0ecde4d5cac8..135eb3d24db7 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -235,5 +235,6 @@ xfs_alloc_get_rec( int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 90de071dd4c2..444626ddbd1b 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -293,14 +293,7 @@ xfs_allocbt_verify( level = be16_to_cpu(block->bb_level); switch (block->bb_magic) { case cpu_to_be32(XFS_ABTB_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; - if (pag && - be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (!xfs_btree_sblock_v5hdr_verify(bp)) return false; /* fall through */ case cpu_to_be32(XFS_ABTB_MAGIC): @@ -311,14 +304,7 @@ xfs_allocbt_verify( return false; break; case cpu_to_be32(XFS_ABTC_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; - if (pag && - be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (!xfs_btree_sblock_v5hdr_verify(bp)) return false; /* fall through */ case cpu_to_be32(XFS_ABTC_MAGIC): @@ -332,21 +318,7 @@ xfs_allocbt_verify( return false; } - /* numrecs verification */ - if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0]) - return false; - - /* sibling pointer verification */ - if (!block->bb_u.s.bb_leftsib || - (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) - return false; - if (!block->bb_u.s.bb_rightsib || - (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) - return false; - - return true; + return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); } static void @@ -379,6 +351,7 @@ xfs_allocbt_write_verify( } const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .name = "xfs_allocbt", .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index f949818fa1c7..fa3b948ef9c2 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -207,7 +207,7 @@ xfs_attr_set( struct xfs_trans_res tres; xfs_fsblock_t firstblock; int rsvd = (flags & ATTR_ROOT) != 0; - int error, err2, committed, local; + int error, err2, local; XFS_STATS_INC(mp, xs_attr_set); @@ -334,25 +334,15 @@ xfs_attr_set( */ xfs_bmap_init(args.flist, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); - if (!error) { - error = xfs_bmap_finish(&args.trans, args.flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args.trans, args.flist, dp); if (error) { - ASSERT(committed); args.trans = NULL; xfs_bmap_cancel(&flist); goto out; } /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args.trans, dp, 0); - - /* * Commit the leaf transformation. We'll need another (linked) * transaction to add the new attribute to the leaf. */ @@ -568,7 +558,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) { xfs_inode_t *dp; struct xfs_buf *bp; - int retval, error, committed, forkoff; + int retval, error, forkoff; trace_xfs_attr_leaf_addname(args); @@ -628,25 +618,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - - /* * Commit the current trans (including the inode) and start * a new one. */ @@ -729,25 +709,14 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } - - /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } /* @@ -775,7 +744,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) { xfs_inode_t *dp; struct xfs_buf *bp; - int error, committed, forkoff; + int error, forkoff; trace_xfs_attr_leaf_removename(args); @@ -803,23 +772,13 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } return 0; } @@ -877,7 +836,7 @@ xfs_attr_node_addname(xfs_da_args_t *args) xfs_da_state_blk_t *blk; xfs_inode_t *dp; xfs_mount_t *mp; - int committed, retval, error; + int retval, error; trace_xfs_attr_node_addname(args); @@ -938,27 +897,16 @@ restart: state = NULL; xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - - /* * Commit the node conversion and start the next * trans in the chain. */ @@ -977,23 +925,13 @@ restart: */ xfs_bmap_init(args->flist, args->firstblock); error = xfs_da3_split(state); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } else { /* * Addition succeeded, update Btree hashvals. @@ -1086,25 +1024,14 @@ restart: if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); error = xfs_da3_join(state); - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } /* @@ -1146,7 +1073,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_da_state_blk_t *blk; xfs_inode_t *dp; struct xfs_buf *bp; - int retval, error, committed, forkoff; + int retval, error, forkoff; trace_xfs_attr_node_removename(args); @@ -1220,24 +1147,13 @@ xfs_attr_node_removename(xfs_da_args_t *args) if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); error = xfs_da3_join(state); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - /* * Commit the Btree join operation and start a new trans. */ @@ -1265,25 +1181,14 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } else xfs_trans_brelse(args->trans, bp); } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index aa187f7ba2dd..01a5ecfedfcf 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -328,6 +328,7 @@ xfs_attr3_leaf_read_verify( } const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { + .name = "xfs_attr3_leaf", .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 5ab95ffa4ae9..a572532a55cd 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -201,6 +201,7 @@ xfs_attr3_rmt_write_verify( } const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { + .name = "xfs_attr3_rmt", .verify_read = xfs_attr3_rmt_read_verify, .verify_write = xfs_attr3_rmt_write_verify, }; @@ -447,8 +448,6 @@ xfs_attr_rmtval_set( * Roll through the "value", allocating blocks on disk as required. */ while (blkcnt > 0) { - int committed; - /* * Allocate a single extent, up to the size of the value. * @@ -466,24 +465,14 @@ xfs_attr_rmtval_set( error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, args->total, &map, &nmap, args->flist); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -614,31 +603,20 @@ xfs_attr_rmtval_remove( blkcnt = args->rmtblkcnt; done = 0; while (!done) { - int committed; - xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, args->firstblock, args->flist, &done); - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + args->dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, args->dp, 0); - - /* * Close out trans and start the next one in the chain. */ error = xfs_trans_roll(&args->trans, args->dp); diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c index 0e8885a59646..0a94cce5ea35 100644 --- a/fs/xfs/libxfs/xfs_bit.c +++ b/fs/xfs/libxfs/xfs_bit.c @@ -32,13 +32,13 @@ int xfs_bitmap_empty(uint *map, uint size) { uint i; - uint ret = 0; for (i = 0; i < size; i++) { - ret |= map[i]; + if (map[i] != 0) + return 0; } - return (ret == 0); + return 1; } /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 119c2422aac7..ef00156f4f96 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -325,9 +325,11 @@ xfs_check_block( /* * Check that the extents for the inode ip are in the right order in all - * btree leaves. + * btree leaves. THis becomes prohibitively expensive for large extent count + * files, so don't bother with inodes that have more than 10,000 extents in + * them. The btree record ordering checks will still be done, so for such large + * bmapbt constructs that is going to catch most corruptions. */ - STATIC void xfs_bmap_check_leaf_extents( xfs_btree_cur_t *cur, /* btree cursor or null */ @@ -352,6 +354,10 @@ xfs_bmap_check_leaf_extents( return; } + /* skip large extent count inodes */ + if (ip->i_d.di_nextents > 10000) + return; + bno = NULLFSBLOCK; mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); @@ -1111,7 +1117,6 @@ xfs_bmap_add_attrfork( xfs_trans_t *tp; /* transaction pointer */ int blks; /* space reservation */ int version = 1; /* superblock attr version */ - int committed; /* xaction was committed */ int logflags; /* logging flags */ int error; /* error return value */ @@ -1214,7 +1219,7 @@ xfs_bmap_add_attrfork( xfs_log_sb(tp); } - error = xfs_bmap_finish(&tp, &flist, &committed); + error = xfs_bmap_finish(&tp, &flist, NULL); if (error) goto bmap_cancel; error = xfs_trans_commit(tp); @@ -1723,10 +1728,11 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp=0; /* value for da_new calculations */ xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ + int whichfork = XFS_DATA_FORK; struct xfs_mount *mp; - mp = bma->tp ? bma->tp->t_mountp : NULL; - ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); + mp = bma->ip->i_mount; + ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); @@ -1785,7 +1791,7 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); @@ -2016,10 +2022,10 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, - &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); + &bma->cur, 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; @@ -2100,10 +2106,10 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, - &tmp_rval, XFS_DATA_FORK); + &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; @@ -2169,10 +2175,10 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, - 1, &tmp_rval, XFS_DATA_FORK); + 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; @@ -2215,13 +2221,13 @@ xfs_bmap_add_extent_delay_real( } /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, - da_old > 0, &tmp_logflags, XFS_DATA_FORK); + da_old > 0, &tmp_logflags, whichfork); bma->logflags |= tmp_logflags; if (error) goto done; @@ -2242,7 +2248,7 @@ xfs_bmap_add_extent_delay_real( if (bma->cur) bma->cur->bc_private.b.allocated = 0; - xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK); + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: bma->logflags |= rval; return error; @@ -2939,7 +2945,7 @@ xfs_bmap_add_extent_hole_real( int state; /* state bits, accessed thru macros */ struct xfs_mount *mp; - mp = bma->tp ? bma->tp->t_mountp : NULL; + mp = bma->ip->i_mount; ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); @@ -5950,7 +5956,6 @@ xfs_bmap_split_extent( struct xfs_trans *tp; struct xfs_bmap_free free_list; xfs_fsblock_t firstfsb; - int committed; int error; tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); @@ -5971,7 +5976,7 @@ xfs_bmap_split_extent( if (error) goto out; - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index a160f8a5a3fc..423a34e832bd 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -195,7 +195,7 @@ void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, struct xfs_bmap_free *flist, struct xfs_mount *mp); void xfs_bmap_cancel(struct xfs_bmap_free *flist); int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, - int *committed); + struct xfs_inode *ip); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 6b0cf6546a82..1637c37bfbaa 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -720,6 +720,7 @@ xfs_bmbt_write_verify( } const struct xfs_buf_ops xfs_bmbt_buf_ops = { + .name = "xfs_bmbt", .verify_read = xfs_bmbt_read_verify, .verify_write = xfs_bmbt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index af1bbee5586e..a0eb18ce3ad3 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4080,3 +4080,61 @@ xfs_btree_change_owner( return 0; } + +/** + * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format + * btree block + * + * @bp: buffer containing the btree block + * @max_recs: pointer to the m_*_mxr max records field in the xfs mount + * @pag_max_level: pointer to the per-ag max level field + */ +bool +xfs_btree_sblock_v5hdr_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + return true; +} + +/** + * xfs_btree_sblock_verify() -- verify a short-format btree block + * + * @bp: buffer containing the btree block + * @max_recs: maximum records allowed in this btree node + */ +bool +xfs_btree_sblock_verify( + struct xfs_buf *bp, + unsigned int max_recs) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > max_recs) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 992dec0638f3..2e874be70209 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -472,4 +472,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_BTREE_TRACE_ARGR(c, r) #define XFS_BTREE_TRACE_CURSOR(c, t) +bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); +bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e89a0f8f827c..097bf7717d80 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -245,6 +245,7 @@ xfs_da3_node_read_verify( } const struct xfs_buf_ops xfs_da3_node_buf_ops = { + .name = "xfs_da3_node", .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 9c10e2b8cfcb..aa17cb788946 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -123,6 +123,7 @@ xfs_dir3_block_write_verify( } const struct xfs_buf_ops xfs_dir3_block_buf_ops = { + .name = "xfs_dir3_block", .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index af71a84f343c..725fc7841fde 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -305,11 +305,13 @@ xfs_dir3_data_write_verify( } const struct xfs_buf_ops xfs_dir3_data_buf_ops = { + .name = "xfs_dir3_data", .verify_read = xfs_dir3_data_read_verify, .verify_write = xfs_dir3_data_write_verify, }; static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { + .name = "xfs_dir3_data_reada", .verify_read = xfs_dir3_data_reada_verify, .verify_write = xfs_dir3_data_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 3923e1f94697..b887fb2a2bcf 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -245,11 +245,13 @@ xfs_dir3_leafn_write_verify( } const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { + .name = "xfs_dir3_leaf1", .verify_read = xfs_dir3_leaf1_read_verify, .verify_write = xfs_dir3_leaf1_write_verify, }; const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { + .name = "xfs_dir3_leafn", .verify_read = xfs_dir3_leafn_read_verify, .verify_write = xfs_dir3_leafn_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 70b0cb2fd556..63ee03db796c 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -150,6 +150,7 @@ xfs_dir3_free_write_verify( } const struct xfs_buf_ops xfs_dir3_free_buf_ops = { + .name = "xfs_dir3_free", .verify_read = xfs_dir3_free_read_verify, .verify_write = xfs_dir3_free_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 5331b7f0460c..3cc3cf767474 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -54,7 +54,7 @@ xfs_dqcheck( xfs_dqid_t id, uint type, /* used only when IO_dorepair is true */ uint flags, - char *str) + const char *str) { xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; int errs = 0; @@ -207,7 +207,8 @@ xfs_dquot_buf_verify_crc( STATIC bool xfs_dquot_buf_verify( struct xfs_mount *mp, - struct xfs_buf *bp) + struct xfs_buf *bp, + int warn) { struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; xfs_dqid_t id = 0; @@ -240,8 +241,7 @@ xfs_dquot_buf_verify( if (i == 0) id = be32_to_cpu(ddq->d_id); - error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, - "xfs_dquot_buf_verify"); + error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__); if (error) return false; } @@ -256,7 +256,7 @@ xfs_dquot_buf_read_verify( if (!xfs_dquot_buf_verify_crc(mp, bp)) xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dquot_buf_verify(mp, bp)) + else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) @@ -264,6 +264,25 @@ xfs_dquot_buf_read_verify( } /* + * readahead errors are silent and simply leave the buffer as !done so a real + * read will then be run with the xfs_dquot_buf_ops verifier. See + * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than + * reporting the failure. + */ +static void +xfs_dquot_buf_readahead_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify_crc(mp, bp) || + !xfs_dquot_buf_verify(mp, bp, 0)) { + xfs_buf_ioerror(bp, -EIO); + bp->b_flags &= ~XBF_DONE; + } +} + +/* * we don't calculate the CRC here as that is done when the dquot is flushed to * the buffer after the update is done. This ensures that the dquot in the * buffer always has an up-to-date CRC value. @@ -274,7 +293,7 @@ xfs_dquot_buf_write_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if (!xfs_dquot_buf_verify(mp, bp)) { + if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) { xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; @@ -282,7 +301,13 @@ xfs_dquot_buf_write_verify( } const struct xfs_buf_ops xfs_dquot_buf_ops = { + .name = "xfs_dquot", .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, }; +const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { + .name = "xfs_dquot_ra", + .verify_read = xfs_dquot_buf_readahead_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 8774498ce0ff..e2536bb1c760 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -786,7 +786,7 @@ typedef struct xfs_agfl { __be64 agfl_lsn; __be32 agfl_crc; __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ -} xfs_agfl_t; +} __attribute__((packed)) xfs_agfl_t; #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 70c1db99f6a7..66d702e6b9ff 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2572,6 +2572,7 @@ xfs_agi_write_verify( } const struct xfs_buf_ops xfs_agi_buf_ops = { + .name = "xfs_agi", .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index f39b285beb19..c679f3c05b63 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -221,7 +221,6 @@ xfs_inobt_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - struct xfs_perag *pag = bp->b_pag; unsigned int level; /* @@ -237,14 +236,7 @@ xfs_inobt_verify( switch (block->bb_magic) { case cpu_to_be32(XFS_IBT_CRC_MAGIC): case cpu_to_be32(XFS_FIBT_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; - if (pag && - be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (!xfs_btree_sblock_v5hdr_verify(bp)) return false; /* fall through */ case cpu_to_be32(XFS_IBT_MAGIC): @@ -254,24 +246,12 @@ xfs_inobt_verify( return 0; } - /* numrecs and level verification */ + /* level verification */ level = be16_to_cpu(block->bb_level); if (level >= mp->m_in_maxlevels) return false; - if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0]) - return false; - - /* sibling pointer verification */ - if (!block->bb_u.s.bb_leftsib || - (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) - return false; - if (!block->bb_u.s.bb_rightsib || - (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) - return false; - return true; + return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]); } static void @@ -304,6 +284,7 @@ xfs_inobt_write_verify( } const struct xfs_buf_ops xfs_inobt_buf_ops = { + .name = "xfs_inobt", .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 268c00f4f83a..1aabfda669b0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -62,11 +62,14 @@ xfs_inobp_check( * has not had the inode cores stamped into it. Hence for readahead, the buffer * may be potentially invalid. * - * If the readahead buffer is invalid, we don't want to mark it with an error, - * but we do want to clear the DONE status of the buffer so that a followup read - * will re-read it from disk. This will ensure that we don't get an unnecessary - * warnings during log recovery and we don't get unnecssary panics on debug - * kernels. + * If the readahead buffer is invalid, we need to mark it with an error and + * clear the DONE status of the buffer so that a followup read will re-read it + * from disk. We don't report the error otherwise to avoid warnings during log + * recovery and we don't get unnecssary panics on debug kernels. We use EIO here + * because all we want to do is say readahead failed; there is no-one to report + * the error to, so this will distinguish it from a non-ra verifier failure. + * Changes to this readahead error behavour also need to be reflected in + * xfs_dquot_buf_readahead_verify(). */ static void xfs_inode_buf_verify( @@ -93,6 +96,7 @@ xfs_inode_buf_verify( XFS_RANDOM_ITOBP_INOTOBP))) { if (readahead) { bp->b_flags &= ~XBF_DONE; + xfs_buf_ioerror(bp, -EIO); return; } @@ -132,11 +136,13 @@ xfs_inode_buf_write_verify( } const struct xfs_buf_ops xfs_inode_buf_ops = { + .name = "xfs_inode", .verify_read = xfs_inode_buf_read_verify, .verify_write = xfs_inode_buf_write_verify, }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { + .name = "xxfs_inode_ra", .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 1c55ccbb379d..8e385f91d660 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -60,6 +60,7 @@ typedef struct xlog_recover { */ #define XLOG_BC_TABLE_SIZE 64 +#define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 1b0a08379759..f51078f1e92a 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -153,7 +153,7 @@ typedef __uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, - xfs_dqid_t id, uint type, uint flags, char *str); + xfs_dqid_t id, uint type, uint flags, const char *str); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a0b071d881a0..8a53eaa349f4 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -679,11 +679,13 @@ xfs_sb_write_verify( } const struct xfs_buf_ops xfs_sb_buf_ops = { + .name = "xfs_sb", .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .name = "xfs_sb_quiet", .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 5be529707903..15c3ceb845b9 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -49,6 +49,7 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index cb6fd20a4d3d..2e2c6716b623 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -168,6 +168,7 @@ xfs_symlink_write_verify( } const struct xfs_buf_ops xfs_symlink_buf_ops = { + .name = "xfs_symlink", .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, }; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 29e7e5dd5178..379c089fb051 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1917,6 +1917,7 @@ xfs_vm_readpage( struct file *unused, struct page *page) { + trace_xfs_vm_readpage(page->mapping->host, 1); return mpage_readpage(page, xfs_get_blocks); } @@ -1927,6 +1928,7 @@ xfs_vm_readpages( struct list_head *pages, unsigned nr_pages) { + trace_xfs_vm_readpages(mapping->host, nr_pages); return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index dbae6490a79a..45ec9e40150c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -91,32 +91,32 @@ xfs_zero_extent( * last due to locking considerations. We never free any extents in * the first transaction. * - * Return 1 if the given transaction was committed and a new one - * started, and 0 otherwise in the committed parameter. + * If an inode *ip is provided, rejoin it to the transaction if + * the transaction was committed. */ int /* error */ xfs_bmap_finish( struct xfs_trans **tp, /* transaction pointer addr */ struct xfs_bmap_free *flist, /* i/o: list extents to free */ - int *committed)/* xact committed or not */ + struct xfs_inode *ip) { struct xfs_efd_log_item *efd; /* extent free data */ struct xfs_efi_log_item *efi; /* extent free intention */ int error; /* error return value */ + int committed;/* xact committed or not */ struct xfs_bmap_free_item *free; /* free extent item */ struct xfs_bmap_free_item *next; /* next item on free list */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - if (flist->xbf_count == 0) { - *committed = 0; + if (flist->xbf_count == 0) return 0; - } + efi = xfs_trans_get_efi(*tp, flist->xbf_count); for (free = flist->xbf_first; free; free = free->xbfi_next) xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); - error = __xfs_trans_roll(tp, NULL, committed); + error = __xfs_trans_roll(tp, ip, &committed); if (error) { /* * If the transaction was committed, drop the EFD reference @@ -128,16 +128,13 @@ xfs_bmap_finish( * transaction so we should return committed=1 even though we're * returning an error. */ - if (*committed) { + if (committed) { xfs_efi_release(efi); xfs_force_shutdown((*tp)->t_mountp, (error == -EFSCORRUPTED) ? SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR); - } else { - *committed = 1; } - return error; } @@ -969,7 +966,6 @@ xfs_alloc_file_space( xfs_bmbt_irec_t imaps[1], *imapp; xfs_bmap_free_t free_list; uint qblocks, resblks, resrtextents; - int committed; int error; trace_xfs_alloc_file_space(ip); @@ -1064,23 +1060,20 @@ xfs_alloc_file_space( error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb, alloc_type, &firstfsb, resblks, imapp, &nimaps, &free_list); - if (error) { + if (error) goto error0; - } /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { + error = xfs_bmap_finish(&tp, &free_list, NULL); + if (error) goto error0; - } error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) { + if (error) break; - } allocated_fsb = imapp->br_blockcount; @@ -1206,7 +1199,6 @@ xfs_free_file_space( xfs_off_t offset, xfs_off_t len) { - int committed; int done; xfs_fileoff_t endoffset_fsb; int error; @@ -1346,17 +1338,15 @@ xfs_free_file_space( error = xfs_bunmapi(tp, ip, startoffset_fsb, endoffset_fsb - startoffset_fsb, 0, 2, &firstfsb, &free_list, &done); - if (error) { + if (error) goto error0; - } /* * complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { + error = xfs_bmap_finish(&tp, &free_list, NULL); + if (error) goto error0; - } error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1434,7 +1424,6 @@ xfs_shift_file_space( int error; struct xfs_bmap_free free_list; xfs_fsblock_t first_block; - int committed; xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; @@ -1526,7 +1515,7 @@ xfs_shift_file_space( if (error) goto out_bmap_cancel; - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ace91e7c713e..daed4bfb85b2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -604,6 +604,13 @@ found: } } + /* + * Clear b_error if this is a lookup from a caller that doesn't expect + * valid data to be found in the buffer. + */ + if (!(flags & XBF_READ)) + xfs_buf_ioerror(bp, 0); + XFS_STATS_INC(target->bt_mount, xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; @@ -1045,7 +1052,7 @@ xfs_buf_ioend_work( xfs_buf_ioend(bp); } -void +static void xfs_buf_ioend_async( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c79b717d9b88..c75721acd867 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -132,6 +132,7 @@ struct xfs_buf_map { struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; struct xfs_buf_ops { + char *name; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); }; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 7ac6c5c586cb..9c44d38dcd1f 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -306,7 +306,7 @@ xfs_qm_dqalloc( xfs_fsblock_t firstblock; xfs_bmap_free_t flist; xfs_bmbt_irec_t map; - int nmaps, error, committed; + int nmaps, error; xfs_buf_t *bp; xfs_trans_t *tp = *tpp; @@ -379,11 +379,12 @@ xfs_qm_dqalloc( xfs_trans_bhold(tp, bp); - if ((error = xfs_bmap_finish(tpp, &flist, &committed))) { + error = xfs_bmap_finish(tpp, &flist, NULL); + if (error) goto error1; - } - if (committed) { + /* Transaction was committed? */ + if (*tpp != tp) { tp = *tpp; xfs_trans_bjoin(tp, bp); } else { @@ -393,9 +394,9 @@ xfs_qm_dqalloc( *O_bpp = bp; return 0; - error1: +error1: xfs_bmap_cancel(&flist); - error0: +error0: xfs_iunlock(quotip, XFS_ILOCK_EXCL); return error; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 74d0e5966ebc..88693a98fac5 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -164,9 +164,9 @@ xfs_verifier_error( { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx", + xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", - __return_address, bp->b_bn); + __return_address, bp->b_ops->name, bp->b_bn); xfs_alert(mp, "Unmount and run xfs_repair"); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f5392ab2def1..ebe9b8290a70 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -402,19 +402,26 @@ xfs_file_splice_read( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - /* for dax, we need to avoid the page cache */ - if (IS_DAX(VFS_I(ip))) - ret = default_file_splice_read(infilp, ppos, pipe, count, flags); - else - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); - if (ret > 0) - XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret); + /* + * DAX inodes cannot ues the page cache for splice, so we have to push + * them through the VFS IO path. This means it goes through + * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we + * cannot lock the splice operation at this level for DAX inodes. + */ + if (IS_DAX(VFS_I(ip))) { + ret = default_file_splice_read(infilp, ppos, pipe, count, + flags); + goto out; + } + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); +out: + if (ret > 0) + XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret); return ret; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8ee393996b7d..ae3758a90ed6 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1143,7 +1143,6 @@ xfs_create( xfs_bmap_free_t free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - int committed; prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; @@ -1226,7 +1225,7 @@ xfs_create( * pointing to itself. */ error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, resblks > 0, &ip, &committed); + prid, resblks > 0, &ip, NULL); if (error) goto out_trans_cancel; @@ -1275,7 +1274,7 @@ xfs_create( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -1427,7 +1426,6 @@ xfs_link( int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int committed; int resblks; trace_xfs_link(tdp, target_name); @@ -1502,11 +1500,10 @@ xfs_link( * link transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - } - error = xfs_bmap_finish (&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) { xfs_bmap_cancel(&free_list); goto error_return; @@ -1555,7 +1552,6 @@ xfs_itruncate_extents( xfs_fileoff_t first_unmap_block; xfs_fileoff_t last_block; xfs_filblks_t unmap_len; - int committed; int error = 0; int done = 0; @@ -1601,9 +1597,7 @@ xfs_itruncate_extents( * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (committed) - xfs_trans_ijoin(tp, ip, 0); + error = xfs_bmap_finish(&tp, &free_list, ip); if (error) goto out_bmap_cancel; @@ -1774,7 +1768,6 @@ xfs_inactive_ifree( { xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int committed; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error; @@ -1841,7 +1834,7 @@ xfs_inactive_ifree( * Just ignore errors at this point. There is nothing we can do except * to try to keep going. Make sure it's not a silent error. */ - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) { xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", __func__, error); @@ -2523,7 +2516,6 @@ xfs_remove( int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int committed; uint resblks; trace_xfs_remove(dp, name); @@ -2624,7 +2616,7 @@ xfs_remove( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -2701,7 +2693,6 @@ xfs_finish_rename( struct xfs_trans *tp, struct xfs_bmap_free *free_list) { - int committed = 0; int error; /* @@ -2711,7 +2702,7 @@ xfs_finish_rename( if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, free_list, &committed); + error = xfs_bmap_finish(&tp, free_list, NULL); if (error) { xfs_bmap_cancel(free_list); xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f4f5b43cf647..d81bdc080370 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -129,7 +129,6 @@ xfs_iomap_write_direct( xfs_trans_t *tp; xfs_bmap_free_t free_list; uint qblocks, resblks, resrtextents; - int committed; int error; int lockmode; int bmapi_flags = XFS_BMAPI_PREALLOC; @@ -203,15 +202,20 @@ xfs_iomap_write_direct( * this outside the transaction context, but if we commit and then crash * we may not have zeroed the blocks and this will be exposed on * recovery of the allocation. Hence we must zero before commit. + * * Further, if we are mapping unwritten extents here, we need to zero * and convert them to written so that we don't need an unwritten extent * callback for DAX. This also means that we need to be able to dip into - * the reserve block pool if there is no space left but we need to do - * unwritten extent conversion. + * the reserve block pool for bmbt block allocation if there is no space + * left but we need to do unwritten extent conversion. */ + if (IS_DAX(VFS_I(ip))) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; - tp->t_flags |= XFS_TRANS_RESERVE; + if (ISUNWRITTEN(imap)) { + tp->t_flags |= XFS_TRANS_RESERVE; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + } } error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, resrtextents); @@ -247,7 +251,7 @@ xfs_iomap_write_direct( /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -693,7 +697,7 @@ xfs_iomap_write_allocate( xfs_bmap_free_t free_list; xfs_filblks_t count_fsb; xfs_trans_t *tp; - int nimaps, committed; + int nimaps; int error = 0; int nres; @@ -794,7 +798,7 @@ xfs_iomap_write_allocate( if (error) goto trans_cancel; - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto trans_cancel; @@ -852,7 +856,6 @@ xfs_iomap_write_unwritten( xfs_bmap_free_t free_list; xfs_fsize_t i_size; uint resblks; - int committed; int error; trace_xfs_unwritten_convert(ip, offset, count); @@ -924,7 +927,7 @@ xfs_iomap_write_unwritten( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f52c72a1a06f..9c9a1c9bcc7f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1188,10 +1188,16 @@ xlog_iodone(xfs_buf_t *bp) int aborted = 0; /* - * Race to shutdown the filesystem if we see an error. + * Race to shutdown the filesystem if we see an error or the iclog is in + * IOABORT state. The IOABORT state is only set in DEBUG mode to inject + * CRC errors into log recovery. */ - if (XFS_TEST_ERROR(bp->b_error, l->l_mp, - XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR, + XFS_RANDOM_IODONE_IOERR) || + iclog->ic_state & XLOG_STATE_IOABORT) { + if (iclog->ic_state & XLOG_STATE_IOABORT) + iclog->ic_state &= ~XLOG_STATE_IOABORT; + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_stale(bp); xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); @@ -1838,6 +1844,23 @@ xlog_sync( /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, iclog->ic_datap, size); +#ifdef DEBUG + /* + * Intentionally corrupt the log record CRC based on the error injection + * frequency, if defined. This facilitates testing log recovery in the + * event of torn writes. Hence, set the IOABORT state to abort the log + * write on I/O completion and shutdown the fs. The subsequent mount + * detects the bad CRC and attempts to recover. + */ + if (log->l_badcrc_factor && + (prandom_u32() % log->l_badcrc_factor == 0)) { + iclog->ic_header.h_crc &= 0xAAAAAAAA; + iclog->ic_state |= XLOG_STATE_IOABORT; + xfs_warn(log->l_mp, + "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", + be64_to_cpu(iclog->ic_header.h_lsn)); + } +#endif bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; @@ -2045,12 +2068,14 @@ xlog_print_tic_res( "QM_DQCLUSTER", "QM_QINOCREATE", "QM_QUOTAOFF_END", - "SB_UNIT", "FSYNC_TS", "GROWFSRT_ALLOC", "GROWFSRT_ZERO", "GROWFSRT_FREE", - "SWAPEXT" + "SWAPEXT", + "CHECKPOINT", + "ICREATE", + "CREATE_TMPFILE" }; xfs_warn(mp, "xlog_write: reservation summary:"); @@ -2791,11 +2816,19 @@ xlog_state_do_callback( } } while (!ioerrors && loopdidcallbacks); +#ifdef DEBUG /* - * make one last gasp attempt to see if iclogs are being left in - * limbo.. + * Make one last gasp attempt to see if iclogs are being left in limbo. + * If the above loop finds an iclog earlier than the current iclog and + * in one of the syncing states, the current iclog is put into + * DO_CALLBACK and the callbacks are deferred to the completion of the + * earlier iclog. Walk the iclogs in order and make sure that no iclog + * is in DO_CALLBACK unless an earlier iclog is in one of the syncing + * states. + * + * Note that SYNCING|IOABORT is a valid state so we cannot just check + * for ic_state == SYNCING. */ -#ifdef DEBUG if (funcdidcallbacks) { first_iclog = iclog = log->l_iclog; do { @@ -2810,7 +2843,7 @@ xlog_state_do_callback( * IOERROR - give up hope all ye who enter here */ if (iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state & XLOG_STATE_SYNCING || iclog->ic_state == XLOG_STATE_DONE_SYNC || iclog->ic_state == XLOG_STATE_IOERROR ) break; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 8daba7491b13..ed8896310c00 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -62,6 +62,7 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ #define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ #define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ +#define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */ #define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ #define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ @@ -410,6 +411,8 @@ struct xlog { /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG void *l_iclog_bak[XLOG_MAX_ICLOGS]; + /* log record crc error injection factor */ + uint32_t l_badcrc_factor; #endif }; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index c5ecaacdd218..da37beb76f6e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -61,6 +61,9 @@ xlog_recover_check_summary( #else #define xlog_recover_check_summary(log) #endif +STATIC int +xlog_do_recovery_pass( + struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); /* * This structure is used during recovery to record the buf log items which @@ -868,6 +871,351 @@ validate_head: } /* + * Seek backwards in the log for log record headers. + * + * Given a starting log block, walk backwards until we find the provided number + * of records or hit the provided tail block. The return value is the number of + * records encountered or a negative error code. The log block and buffer + * pointer of the last record seen are returned in rblk and rhead respectively. + */ +STATIC int +xlog_rseek_logrec_hdr( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk, + int count, + struct xfs_buf *bp, + xfs_daddr_t *rblk, + struct xlog_rec_header **rhead, + bool *wrapped) +{ + int i; + int error; + int found = 0; + char *offset = NULL; + xfs_daddr_t end_blk; + + *wrapped = false; + + /* + * Walk backwards from the head block until we hit the tail or the first + * block in the log. + */ + end_blk = head_blk > tail_blk ? tail_blk : 0; + for (i = (int) head_blk - 1; i >= end_blk; i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out_error; + + if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; + } + } + + /* + * If we haven't hit the tail block or the log record header count, + * start looking again from the end of the physical log. Note that + * callers can pass head == tail if the tail is not yet known. + */ + if (tail_blk >= head_blk && found != count) { + for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out_error; + + if (*(__be32 *)offset == + cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *wrapped = true; + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; + } + } + } + + return found; + +out_error: + return error; +} + +/* + * Seek forward in the log for log record headers. + * + * Given head and tail blocks, walk forward from the tail block until we find + * the provided number of records or hit the head block. The return value is the + * number of records encountered or a negative error code. The log block and + * buffer pointer of the last record seen are returned in rblk and rhead + * respectively. + */ +STATIC int +xlog_seek_logrec_hdr( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk, + int count, + struct xfs_buf *bp, + xfs_daddr_t *rblk, + struct xlog_rec_header **rhead, + bool *wrapped) +{ + int i; + int error; + int found = 0; + char *offset = NULL; + xfs_daddr_t end_blk; + + *wrapped = false; + + /* + * Walk forward from the tail block until we hit the head or the last + * block in the log. + */ + end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1; + for (i = (int) tail_blk; i <= end_blk; i++) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out_error; + + if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; + } + } + + /* + * If we haven't hit the head block or the log record header count, + * start looking again from the start of the physical log. + */ + if (tail_blk > head_blk && found != count) { + for (i = 0; i < (int) head_blk; i++) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out_error; + + if (*(__be32 *)offset == + cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *wrapped = true; + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; + } + } + } + + return found; + +out_error: + return error; +} + +/* + * Check the log tail for torn writes. This is required when torn writes are + * detected at the head and the head had to be walked back to a previous record. + * The tail of the previous record must now be verified to ensure the torn + * writes didn't corrupt the previous tail. + * + * Return an error if CRC verification fails as recovery cannot proceed. + */ +STATIC int +xlog_verify_tail( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + struct xlog_rec_header *thead; + struct xfs_buf *bp; + xfs_daddr_t first_bad; + int count; + int error = 0; + bool wrapped; + xfs_daddr_t tmp_head; + + bp = xlog_get_bp(log, 1); + if (!bp) + return -ENOMEM; + + /* + * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get + * a temporary head block that points after the last possible + * concurrently written record of the tail. + */ + count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, + XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, + &wrapped); + if (count < 0) { + error = count; + goto out; + } + + /* + * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran + * into the actual log head. tmp_head points to the start of the record + * so update it to the actual head block. + */ + if (count < XLOG_MAX_ICLOGS + 1) + tmp_head = head_blk; + + /* + * We now have a tail and temporary head block that covers at least + * XLOG_MAX_ICLOGS records from the tail. We need to verify that these + * records were completely written. Run a CRC verification pass from + * tail to head and return the result. + */ + error = xlog_do_recovery_pass(log, tmp_head, tail_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Detect and trim torn writes from the head of the log. + * + * Storage without sector atomicity guarantees can result in torn writes in the + * log in the event of a crash. Our only means to detect this scenario is via + * CRC verification. While we can't always be certain that CRC verification + * failure is due to a torn write vs. an unrelated corruption, we do know that + * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at + * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of + * the log and treat failures in this range as torn writes as a matter of + * policy. In the event of CRC failure, the head is walked back to the last good + * record in the log and the tail is updated from that record and verified. + */ +STATIC int +xlog_verify_head( + struct xlog *log, + xfs_daddr_t *head_blk, /* in/out: unverified head */ + xfs_daddr_t *tail_blk, /* out: tail block */ + struct xfs_buf *bp, + xfs_daddr_t *rhead_blk, /* start blk of last record */ + struct xlog_rec_header **rhead, /* ptr to last record */ + bool *wrapped) /* last rec. wraps phys. log */ +{ + struct xlog_rec_header *tmp_rhead; + struct xfs_buf *tmp_bp; + xfs_daddr_t first_bad; + xfs_daddr_t tmp_rhead_blk; + int found; + int error; + bool tmp_wrapped; + + /* + * Search backwards through the log looking for the log record header + * block. This wraps all the way back around to the head so something is + * seriously wrong if we can't find it. + */ + found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk, + rhead, wrapped); + if (found < 0) + return found; + if (!found) { + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + return -EIO; + } + + *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn)); + + /* + * Now that we have a tail block, check the head of the log for torn + * writes. Search again until we hit the tail or the maximum number of + * log record I/Os that could have been in flight at one time. Use a + * temporary buffer so we don't trash the rhead/bp pointer from the + * call above. + */ + tmp_bp = xlog_get_bp(log, 1); + if (!tmp_bp) + return -ENOMEM; + error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk, + XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk, + &tmp_rhead, &tmp_wrapped); + xlog_put_bp(tmp_bp); + if (error < 0) + return error; + + /* + * Now run a CRC verification pass over the records starting at the + * block found above to the current head. If a CRC failure occurs, the + * log block of the first bad record is saved in first_bad. + */ + error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + if (error == -EFSBADCRC) { + /* + * We've hit a potential torn write. Reset the error and warn + * about it. + */ + error = 0; + xfs_warn(log->l_mp, +"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.", + first_bad, *head_blk); + + /* + * Get the header block and buffer pointer for the last good + * record before the bad record. + * + * Note that xlog_find_tail() clears the blocks at the new head + * (i.e., the records with invalid CRC) if the cycle number + * matches the the current cycle. + */ + found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp, + rhead_blk, rhead, wrapped); + if (found < 0) + return found; + if (found == 0) /* XXX: right thing to do here? */ + return -EIO; + + /* + * Reset the head block to the starting block of the first bad + * log record and set the tail block based on the last good + * record. + * + * Bail out if the updated head/tail match as this indicates + * possible corruption outside of the acceptable + * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair... + */ + *head_blk = first_bad; + *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn)); + if (*head_blk == *tail_blk) { + ASSERT(0); + return 0; + } + + /* + * Now verify the tail based on the updated head. This is + * required because the torn writes trimmed from the head could + * have been written over the tail of a previous record. Return + * any errors since recovery cannot proceed if the tail is + * corrupt. + * + * XXX: This leaves a gap in truly robust protection from torn + * writes in the log. If the head is behind the tail, the tail + * pushes forward to create some space and then a crash occurs + * causing the writes into the previous record's tail region to + * tear, log recovery isn't able to recover. + * + * How likely is this to occur? If possible, can we do something + * more intelligent here? Is it safe to push the tail forward if + * we can determine that the tail is within the range of the + * torn write (e.g., the kernel can only overwrite the tail if + * it has actually been pushed forward)? Alternatively, could we + * somehow prevent this condition at runtime? + */ + error = xlog_verify_tail(log, *head_blk, *tail_blk); + } + + return error; +} + +/* * Find the sync block number or the tail of the log. * * This will be the block number of the last record to have its @@ -893,13 +1241,13 @@ xlog_find_tail( xlog_op_header_t *op_head; char *offset = NULL; xfs_buf_t *bp; - int error, i, found; + int error; xfs_daddr_t umount_data_blk; xfs_daddr_t after_umount_blk; + xfs_daddr_t rhead_blk; xfs_lsn_t tail_lsn; int hblks; - - found = 0; + bool wrapped = false; /* * Find previous log record @@ -923,48 +1271,16 @@ xlog_find_tail( } /* - * Search backwards looking for log record header block + * Trim the head block back to skip over torn records. We can have + * multiple log I/Os in flight at any time, so we assume CRC failures + * back through the previous several records are torn writes and skip + * them. */ ASSERT(*head_blk < INT_MAX); - for (i = (int)(*head_blk) - 1; i >= 0; i--) { - error = xlog_bread(log, i, 1, bp, &offset); - if (error) - goto done; - - if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { - found = 1; - break; - } - } - /* - * If we haven't found the log record header block, start looking - * again from the end of the physical log. XXXmiken: There should be - * a check here to make sure we didn't search more than N blocks in - * the previous code. - */ - if (!found) { - for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { - error = xlog_bread(log, i, 1, bp, &offset); - if (error) - goto done; - - if (*(__be32 *)offset == - cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { - found = 2; - break; - } - } - } - if (!found) { - xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); - xlog_put_bp(bp); - ASSERT(0); - return -EIO; - } - - /* find blk_no of tail of log */ - rhead = (xlog_rec_header_t *)offset; - *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); + error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk, + &rhead, &wrapped); + if (error) + goto done; /* * Reset log values according to the state of the log when we @@ -976,10 +1292,10 @@ xlog_find_tail( * written was complete and ended exactly on the end boundary * of the physical log. */ - log->l_prev_block = i; + log->l_prev_block = rhead_blk; log->l_curr_block = (int)*head_blk; log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); - if (found == 2) + if (wrapped) log->l_curr_cycle++; atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); @@ -1014,12 +1330,13 @@ xlog_find_tail( } else { hblks = 1; } - after_umount_blk = (i + hblks + (int) - BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; + after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)); + after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize); tail_lsn = atomic64_read(&log->l_tail_lsn); if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { - umount_data_blk = (i + hblks) % log->l_logBBsize; + umount_data_blk = rhead_blk + hblks; + umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize); error = xlog_bread(log, umount_data_blk, 1, bp, &offset); if (error) goto done; @@ -3204,6 +3521,7 @@ xlog_recover_dquot_ra_pass2( struct xfs_disk_dquot *recddq; struct xfs_dq_logformat *dq_f; uint type; + int len; if (mp->m_qflags == 0) @@ -3224,8 +3542,12 @@ xlog_recover_dquot_ra_pass2( ASSERT(dq_f); ASSERT(dq_f->qlf_len == 1); - xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); + len = XFS_FSB_TO_BB(mp, dq_f->qlf_len); + if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0)) + return; + + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len, + &xfs_dquot_buf_ra_ops); } STATIC void @@ -4118,26 +4440,69 @@ xlog_recover_process_iunlinks( mp->m_dmevmask = mp_dmevmask; } +STATIC int +xlog_unpack_data( + struct xlog_rec_header *rhead, + char *dp, + struct xlog *log) +{ + int i, j, k; + + for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && + i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { + *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; + for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; + dp += BBSIZE; + } + } + + return 0; +} + /* - * Upack the log buffer data and crc check it. If the check fails, issue a - * warning if and only if the CRC in the header is non-zero. This makes the - * check an advisory warning, and the zero CRC check will prevent failure - * warnings from being emitted when upgrading the kernel from one that does not - * add CRCs by default. - * - * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log - * corruption failure + * CRC check, unpack and process a log record. */ STATIC int -xlog_unpack_data_crc( +xlog_recover_process( + struct xlog *log, + struct hlist_head rhash[], struct xlog_rec_header *rhead, char *dp, - struct xlog *log) + int pass) { + int error; __le32 crc; crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); - if (crc != rhead->h_crc) { + + /* + * Nothing else to do if this is a CRC verification pass. Just return + * if this a record with a non-zero crc. Unfortunately, mkfs always + * sets h_crc to 0 so we must consider this valid even on v5 supers. + * Otherwise, return EFSBADCRC on failure so the callers up the stack + * know precisely what failed. + */ + if (pass == XLOG_RECOVER_CRCPASS) { + if (rhead->h_crc && crc != le32_to_cpu(rhead->h_crc)) + return -EFSBADCRC; + return 0; + } + + /* + * We're in the normal recovery path. Issue a warning if and only if the + * CRC in the header is non-zero. This is an advisory warning and the + * zero CRC check prevents warnings from being emitted when upgrading + * the kernel from one that does not add CRCs by default. + */ + if (crc != le32_to_cpu(rhead->h_crc)) { if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", @@ -4147,47 +4512,18 @@ xlog_unpack_data_crc( } /* - * If we've detected a log record corruption, then we can't - * recover past this point. Abort recovery if we are enforcing - * CRC protection by punting an error back up the stack. + * If the filesystem is CRC enabled, this mismatch becomes a + * fatal log corruption failure. */ if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) return -EFSCORRUPTED; } - return 0; -} - -STATIC int -xlog_unpack_data( - struct xlog_rec_header *rhead, - char *dp, - struct xlog *log) -{ - int i, j, k; - int error; - - error = xlog_unpack_data_crc(rhead, dp, log); + error = xlog_unpack_data(rhead, dp, log); if (error) return error; - for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; - dp += BBSIZE; - } - - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; - for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; - dp += BBSIZE; - } - } - - return 0; + return xlog_recover_process_data(log, rhash, rhead, dp, pass); } STATIC int @@ -4239,18 +4575,21 @@ xlog_do_recovery_pass( struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, - int pass) + int pass, + xfs_daddr_t *first_bad) /* out: first bad log rec */ { xlog_rec_header_t *rhead; xfs_daddr_t blk_no; + xfs_daddr_t rhead_blk; char *offset; xfs_buf_t *hbp, *dbp; - int error = 0, h_size; + int error = 0, h_size, h_len; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; struct hlist_head rhash[XLOG_RHASH_SIZE]; ASSERT(head_blk != tail_blk); + rhead_blk = 0; /* * Read the header of the tail block and get the iclog buffer size from @@ -4274,7 +4613,31 @@ xlog_do_recovery_pass( error = xlog_valid_rec_header(log, rhead, tail_blk); if (error) goto bread_err1; + + /* + * xfsprogs has a bug where record length is based on lsunit but + * h_size (iclog size) is hardcoded to 32k. Now that we + * unconditionally CRC verify the unmount record, this means the + * log buffer can be too small for the record and cause an + * overrun. + * + * Detect this condition here. Use lsunit for the buffer size as + * long as this looks like the mkfs case. Otherwise, return an + * error to avoid a buffer overrun. + */ h_size = be32_to_cpu(rhead->h_size); + h_len = be32_to_cpu(rhead->h_len); + if (h_len > h_size) { + if (h_len <= log->l_mp->m_logbsize && + be32_to_cpu(rhead->h_num_logops) == 1) { + xfs_warn(log->l_mp, + "invalid iclog size (%d bytes), using lsunit (%d bytes)", + h_size, log->l_mp->m_logbsize); + h_size = log->l_mp->m_logbsize; + } else + return -EFSCORRUPTED; + } + if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && (h_size > XLOG_HEADER_CYCLE_SIZE)) { hblks = h_size / XLOG_HEADER_CYCLE_SIZE; @@ -4301,7 +4664,7 @@ xlog_do_recovery_pass( } memset(rhash, 0, sizeof(rhash)); - blk_no = tail_blk; + blk_no = rhead_blk = tail_blk; if (tail_blk > head_blk) { /* * Perform recovery around the end of the physical log. @@ -4408,19 +4771,18 @@ xlog_do_recovery_pass( goto bread_err2; } - error = xlog_unpack_data(rhead, offset, log); + error = xlog_recover_process(log, rhash, rhead, offset, + pass); if (error) goto bread_err2; - error = xlog_recover_process_data(log, rhash, - rhead, offset, pass); - if (error) - goto bread_err2; blk_no += bblks; + rhead_blk = blk_no; } ASSERT(blk_no >= log->l_logBBsize); blk_no -= log->l_logBBsize; + rhead_blk = blk_no; } /* read first part of physical log */ @@ -4441,21 +4803,22 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - error = xlog_unpack_data(rhead, offset, log); + error = xlog_recover_process(log, rhash, rhead, offset, pass); if (error) goto bread_err2; - error = xlog_recover_process_data(log, rhash, - rhead, offset, pass); - if (error) - goto bread_err2; blk_no += bblks + hblks; + rhead_blk = blk_no; } bread_err2: xlog_put_bp(dbp); bread_err1: xlog_put_bp(hbp); + + if (error && first_bad) + *first_bad = rhead_blk; + return error; } @@ -4493,7 +4856,7 @@ xlog_do_log_recovery( INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); error = xlog_do_recovery_pass(log, head_blk, tail_blk, - XLOG_RECOVER_PASS1); + XLOG_RECOVER_PASS1, NULL); if (error != 0) { kmem_free(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; @@ -4504,7 +4867,7 @@ xlog_do_log_recovery( * When it is complete free the table of buf cancel items. */ error = xlog_do_recovery_pass(log, head_blk, tail_blk, - XLOG_RECOVER_PASS2); + XLOG_RECOVER_PASS2, NULL); #ifdef DEBUG if (!error) { int i; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ab1bac6a3a1c..be02a68b2fe2 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -766,7 +766,6 @@ xfs_growfs_rt_alloc( { xfs_fileoff_t bno; /* block number in file */ struct xfs_buf *bp; /* temporary buffer for zeroing */ - int committed; /* transaction committed flag */ xfs_daddr_t d; /* disk block address */ int error; /* error return value */ xfs_fsblock_t firstblock;/* first block allocated in xaction */ @@ -811,7 +810,7 @@ xfs_growfs_rt_alloc( /* * Free any blocks freed up in the transaction, then commit. */ - error = xfs_bmap_finish(&tp, &flist, &committed); + error = xfs_bmap_finish(&tp, &flist, NULL); if (error) goto out_bmap_cancel; error = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 36bd8825bfb0..59c9b7bd958d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -137,7 +137,7 @@ static const match_table_t tokens = { }; -STATIC unsigned long +STATIC int suffix_kstrtoint(char *s, unsigned int base, int *res) { int last, shift_left_factor = 0, _res; @@ -1714,8 +1714,8 @@ xfs_init_zones(void) xfs_inode_zone = kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD, - xfs_fs_inode_init_once); + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD | + KM_ZONE_ACCOUNT, xfs_fs_inode_init_once); if (!xfs_inode_zone) goto out_destroy_efi_zone; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 996481eeb491..b44284c1adda 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -178,7 +178,6 @@ xfs_symlink( struct xfs_bmap_free free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - int committed; xfs_fileoff_t first_fsb; xfs_filblks_t fs_blocks; int nmaps; @@ -387,7 +386,7 @@ xfs_symlink( xfs_trans_set_sync(tp); } - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -434,7 +433,6 @@ xfs_inactive_symlink_rmt( struct xfs_inode *ip) { xfs_buf_t *bp; - int committed; int done; int error; xfs_fsblock_t first_block; @@ -510,16 +508,10 @@ xfs_inactive_symlink_rmt( /* * Commit the first transaction. This logs the EFI and the inode. */ - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, ip); if (error) goto error_bmap_cancel; /* - * The transaction must have been committed, since there were - * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. - * The new tp has the extent freeing and EFDs. - */ - ASSERT(committed); - /* * The first xact was committed, so add the inode to the new one. * Mark it dirty so it will be logged and moved forward in the log as * part of every commit. diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index ee70f5dec9dc..641d625eb334 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -255,11 +255,47 @@ write_grant_head_show( } XFS_SYSFS_ATTR_RO(write_grant_head); +#ifdef DEBUG +STATIC ssize_t +log_badcrc_factor_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xlog *log = to_xlog(kobject); + int ret; + uint32_t val; + + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + + log->l_badcrc_factor = val; + + return count; +} + +STATIC ssize_t +log_badcrc_factor_show( + struct kobject *kobject, + char *buf) +{ + struct xlog *log = to_xlog(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", log->l_badcrc_factor); +} + +XFS_SYSFS_ATTR_RW(log_badcrc_factor); +#endif /* DEBUG */ + static struct attribute *xfs_log_attrs[] = { ATTR_LIST(log_head_lsn), ATTR_LIST(log_tail_lsn), ATTR_LIST(reserve_grant_head), ATTR_LIST(write_grant_head), +#ifdef DEBUG + ATTR_LIST(log_badcrc_factor), +#endif NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 877079eb0f8f..391d797cb53f 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1222,6 +1222,32 @@ DEFINE_PAGE_EVENT(xfs_writepage); DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_invalidatepage); +DECLARE_EVENT_CLASS(xfs_readpage_class, + TP_PROTO(struct inode *inode, int nr_pages), + TP_ARGS(inode, nr_pages), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, nr_pages) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->nr_pages = nr_pages; + ), + TP_printk("dev %d:%d ino 0x%llx nr_pages %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->nr_pages) +) + +#define DEFINE_READPAGE_EVENT(name) \ +DEFINE_EVENT(xfs_readpage_class, name, \ + TP_PROTO(struct inode *inode, int nr_pages), \ + TP_ARGS(inode, nr_pages)) +DEFINE_READPAGE_EVENT(xfs_vm_readpage); +DEFINE_READPAGE_EVENT(xfs_vm_readpages); + DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int type, struct xfs_bmbt_irec *irec), diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index ce78534a047e..995170194df0 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -572,12 +572,16 @@ xfs_quota_warn( struct xfs_dquot *dqp, int type) { - /* no warnings for project quotas - we just return ENOSPC later */ + enum quota_type qtype; + if (dqp->dq_flags & XFS_DQ_PROJ) - return; - quota_send_warning(make_kqid(&init_user_ns, - (dqp->dq_flags & XFS_DQ_USER) ? - USRQUOTA : GRPQUOTA, + qtype = PRJQUOTA; + else if (dqp->dq_flags & XFS_DQ_USER) + qtype = USRQUOTA; + else + qtype = GRPQUOTA; + + quota_send_warning(make_kqid(&init_user_ns, qtype, be32_to_cpu(dqp->q_core.d_id)), mp->m_super->s_dev, type); } |